#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract the text of the RMO overview/summary documents.

The extracted text is collected into a single UTF-8 text file so it can be
used to distil value propositions and advantages.
"""

import os
import sys
from pathlib import Path

# Both document libraries are optional: when one is missing the matching
# extractor returns None instead of failing at import time.
try:
    from docx import Document
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False
    print("警告: python-docx 未安装,无法读取 .docx 文件")

try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False
    print("警告: python-pptx 未安装,无法读取 .pptx 文件")


# Directory that holds the source documents and receives the output file.
DEFAULT_BASE_DIR = Path(r"d:\SoftwarePrj\RMO网站\资源中心\关于RMO介绍\RMO概述\概述总结")
# Name of the text file written into the base directory.
OUTPUT_FILE_NAME = "提取内容_概述总结.txt"
# Maximum number of lines echoed to the console after extraction.
PREVIEW_LINE_LIMIT = 50
# Horizontal rule used around headings in the report.
SEPARATOR = "=" * 80


def extract_docx(file_path):
    """Extract the text of a DOCX file.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The non-blank paragraphs of the document joined with newlines,
        ``None`` when python-docx is not installed, or a string starting
        with ``"读取错误: "`` when the file could not be read.  Only
        ``doc.paragraphs`` is consulted.
    """
    if not HAS_DOCX:
        return None

    try:
        doc = Document(file_path)
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return "\n".join(paragraphs)
    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort the
        # whole run, so the error is reported in place of the content.
        return f"读取错误: {e}"


def _iter_shape_texts(shape):
    """Yield the stripped, non-empty text fragments contained in *shape*."""
    if hasattr(shape, "text"):
        text = shape.text.strip()
        if text:
            yield text
    elif getattr(shape, "has_table", False):
        # Table frames carry no ``text`` attribute; emit one tab-separated
        # line per row that contains any text.
        for row in shape.table.rows:
            cells = [cell.text.strip() for cell in row.cells]
            if any(cells):
                yield "\t".join(cells)
    elif hasattr(shape, "shapes"):
        # Group shapes hold their children in ``.shapes``; descend so that
        # nested text is not lost.
        for child in shape.shapes:
            yield from _iter_shape_texts(child)


def extract_pptx(file_path):
    """Extract the text of a PPTX file, slide by slide.

    Text is taken from shapes exposing ``text``, from table cells and from
    shapes nested inside groups.  Slides without any text are skipped.

    Args:
        file_path: Path of the .pptx file to read.

    Returns:
        One section per slide, each headed by ``--- 幻灯片 N ---`` and
        separated by a blank line, ``None`` when python-pptx is not
        installed, or a string starting with ``"读取错误: "`` when the file
        could not be read.
    """
    if not HAS_PPTX:
        return None

    try:
        prs = Presentation(file_path)
        slides_text = []
        for i, slide in enumerate(prs.slides, 1):
            slide_text = [
                text
                for shape in slide.shapes
                for text in _iter_shape_texts(shape)
            ]
            if slide_text:
                slides_text.append(
                    f"--- 幻灯片 {i} ---\n" + "\n".join(slide_text)
                )
        return "\n\n".join(slides_text)
    except Exception as e:
        # Deliberate best effort, same contract as extract_docx.
        return f"读取错误: {e}"


def main(base_dir=None):
    """Extract all configured documents and write the combined report.

    Args:
        base_dir: Directory containing the documents; the report is written
            there as well.  Defaults to ``DEFAULT_BASE_DIR``.

    Returns:
        ``0`` on success, ``1`` when *base_dir* is not an existing directory.
    """
    base_dir = DEFAULT_BASE_DIR if base_dir is None else Path(base_dir)

    # Fail early with a clear message rather than crashing when the report
    # is opened for writing at the very end.
    if not base_dir.is_dir():
        print(f"目录不存在: {base_dir}")
        return 1

    files_to_read = [
        ("方案制定/一站式临床试验风险管理:RMO.docx", extract_docx),
        ("构建中国药械风险能力责任保障体系(阶段性汇报)【20250505】.pptx", extract_pptx),
        ("华泰_构建生命科学领域责任保险新质生产力202407.pptx", extract_pptx),
        ("临研安生命科学保险技术支持服务介绍 202505.pptx", extract_pptx),
        ("MAH持有人责任风险管理方案-网站建设.pptx", extract_pptx),
    ]

    output_lines = [SEPARATOR, "RMO概述总结文件内容提取", SEPARATOR, ""]

    for file_rel_path, extract_func in files_to_read:
        file_path = base_dir / file_rel_path

        if not file_path.exists():
            output_lines.append(f"文件不存在: {file_rel_path}")
            output_lines.append("")
            continue

        output_lines.append(f"\n{SEPARATOR}")
        output_lines.append(f"文件: {file_rel_path}")
        output_lines.append(f"{SEPARATOR}\n")

        content = extract_func(file_path)
        if content is None:
            # None is what the extractors return when their library is
            # not installed.
            output_lines.append("无法提取内容(可能缺少依赖库)")
        elif content:
            output_lines.append(content)
        else:
            # The file was read successfully but held no text; do not
            # blame a missing dependency for that.
            output_lines.append("未提取到文本内容")

        output_lines.append("\n")

    report = "\n".join(output_lines)

    # Write the collected text to the output file.
    output_file = base_dir / OUTPUT_FILE_NAME
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"内容已提取到: {output_file}")
    print("\n提取的内容预览:")
    # Entries of output_lines may span many lines, so limit the preview by
    # actual lines of the report rather than by list entries.
    print("\n".join(report.split("\n")[:PREVIEW_LINE_LIMIT]))
    return 0


if __name__ == "__main__":
    sys.exit(main())