RMO-Front/extract_rmo_content.py

100 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
提取RMO概述总结文件内容用于提炼价值主张和优势
"""
import os
import sys
from pathlib import Path
# Optional dependency: python-docx. HAS_DOCX records whether the import
# succeeded so that extract_docx can decline to run without it.
try:
    from docx import Document
except ImportError:
    HAS_DOCX = False
    print("警告: python-docx 未安装,无法读取 .docx 文件")
else:
    HAS_DOCX = True

# Optional dependency: python-pptx. HAS_PPTX plays the same role for
# extract_pptx.
try:
    from pptx import Presentation
except ImportError:
    HAS_PPTX = False
    print("警告: python-pptx 未安装,无法读取 .pptx 文件")
else:
    HAS_PPTX = True
def extract_docx(file_path):
    """Return the paragraph text of a DOCX file as one newline-joined string.

    Paragraphs whose text is empty or whitespace-only are skipped; the text
    of the remaining paragraphs is kept as-is (not stripped).

    Args:
        file_path: Location of the .docx file, handed directly to
            ``Document``.

    Returns:
        ``None`` when python-docx is unavailable (``HAS_DOCX`` is false).
        Otherwise the joined paragraph text, or a string starting with
        ``读取错误:`` describing the exception if reading failed.
    """
    if not HAS_DOCX:
        return None

    try:
        document = Document(file_path)
        kept = []
        for para in document.paragraphs:
            if para.text.strip():
                kept.append(para.text)
        return '\n'.join(kept)
    except Exception as exc:
        # Failures are reported in-band as text rather than raised, so the
        # caller can still include them in its report.
        return f"读取错误: {exc}"
def extract_pptx(file_path):
    """Return the shape text of a PPTX file, grouped slide by slide.

    For every slide (numbered from 1) the stripped text of each shape that
    exposes a non-blank ``text`` attribute is collected. Slides that yield
    no text are left out. Each remaining slide becomes a section headed by
    ``--- 幻灯片 N ---``, and sections are separated by a blank line.

    Args:
        file_path: Location of the .pptx file, handed directly to
            ``Presentation``.

    Returns:
        ``None`` when python-pptx is unavailable (``HAS_PPTX`` is false).
        Otherwise the assembled text, or a string starting with
        ``读取错误:`` describing the exception if reading failed.
    """
    if not HAS_PPTX:
        return None

    try:
        presentation = Presentation(file_path)
        sections = []
        for number, slide in enumerate(presentation.slides, start=1):
            texts = [
                shape.text.strip()
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            if not texts:
                continue
            sections.append(f"--- 幻灯片 {number} ---\n" + "\n".join(texts))
        return "\n\n".join(sections)
    except Exception as exc:
        # Failures are reported in-band as text rather than raised, so the
        # caller can still include them in its report.
        return f"读取错误: {exc}"
def main(base_dir=None):
    """Extract text from the RMO overview documents into one report file.

    Every configured document under ``base_dir`` is passed to its extractor
    (``extract_docx`` or ``extract_pptx``). The collected text is written as
    UTF-8 to ``提取内容_概述总结.txt`` inside ``base_dir``, and the first 50
    report entries are printed to stdout as a preview (an entry may span
    several lines).

    Args:
        base_dir: Directory that holds the source documents and receives the
            report. ``None`` (the default) selects the original hard-coded
            Windows location, so calling ``main()`` behaves as before.

    Raises:
        FileNotFoundError: If ``base_dir`` is not an existing directory.
    """
    if base_dir is None:
        base_dir = Path(r"d:\SoftwarePrj\RMO网站\资源中心\关于RMO介绍\RMO概述\概述总结")
    else:
        base_dir = Path(base_dir)

    # The report is written into base_dir, so a missing directory used to
    # surface only at the final open() after every file had been probed.
    # Fail up front with the same exception type and a clearer message.
    if not base_dir.is_dir():
        raise FileNotFoundError(f"目录不存在: {base_dir}")

    files_to_read = [
        ("方案制定/一站式临床试验风险管理RMO.docx", extract_docx),
        ("构建中国药械风险能力责任保障体系(阶段性汇报)【20250505】.pptx", extract_pptx),
        ("华泰_构建生命科学领域责任保险新质生产力202407.pptx", extract_pptx),
        ("临研安生命科学保险技术支持服务介绍 202505.pptx", extract_pptx),
        ("MAH持有人责任风险管理方案-网站建设.pptx", extract_pptx),
    ]

    separator = "=" * 80
    output_lines = [separator, "RMO概述总结文件内容提取", separator, ""]

    for file_rel_path, extract_func in files_to_read:
        file_path = base_dir / file_rel_path
        if not file_path.exists():
            output_lines.append(f"文件不存在: {file_rel_path}")
            output_lines.append("")
            continue

        output_lines.append(f"\n{separator}")
        output_lines.append(f"文件: {file_rel_path}")
        output_lines.append(f"{separator}\n")

        content = extract_func(file_path)
        if content is None:
            # The extractors return None only when their parsing library
            # could not be imported.
            output_lines.append("无法提取内容(可能缺少依赖库)")
        elif content:
            output_lines.append(content)
        else:
            # An empty string means the file was read but held no text;
            # this is not a dependency problem, so say so explicitly.
            output_lines.append("文件中没有可提取的文本内容")
        output_lines.append("\n")

    # Write the full report next to the source documents.
    output_file = base_dir / "提取内容_概述总结.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))

    print(f"内容已提取到: {output_file}")
    print("\n提取的内容预览:")
    print('\n'.join(output_lines[:50]))
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()