100 lines
3.2 KiB
Python
100 lines
3.2 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
提取RMO概述总结文件内容,用于提炼价值主张和优势
|
||
"""
|
||
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Optional third-party readers. Each import is attempted once at module load;
# the HAS_* flag records the outcome so the extract_* helpers can bail out
# instead of raising NameError when a library is missing.
try:
    from docx import Document
except ImportError:
    HAS_DOCX = False
    print("警告: python-docx 未安装,无法读取 .docx 文件")
else:
    HAS_DOCX = True

try:
    from pptx import Presentation
except ImportError:
    HAS_PPTX = False
    print("警告: python-pptx 未安装,无法读取 .pptx 文件")
else:
    HAS_PPTX = True
|
||
|
||
def extract_docx(file_path):
    """Return the text of every non-blank paragraph in a DOCX file.

    Paragraph texts are joined with newlines. Returns None when the
    HAS_DOCX flag is false (python-docx could not be imported). Any
    exception raised while opening or reading the document is caught and
    returned as a "读取错误: ..." string rather than propagated.
    """
    if not HAS_DOCX:
        return None
    try:
        document = Document(file_path)
        # Keep the paragraph text untouched; strip() is used only to decide
        # whether the paragraph is blank.
        non_blank = (
            para.text for para in document.paragraphs if para.text.strip()
        )
        return "\n".join(non_blank)
    except Exception as err:
        return f"读取错误: {err}"
|
||
|
||
def extract_pptx(file_path):
    """Return the text of a PPTX file, grouped slide by slide.

    For each slide, the stripped text of every shape that exposes a
    non-blank ``text`` attribute is collected. Slides that yield text are
    rendered as a "--- 幻灯片 N ---" header (N counted from 1) followed by
    the shape texts, and the slide sections are separated by a blank line.
    Returns None when the HAS_PPTX flag is false (python-pptx could not be
    imported). Any exception raised while opening or reading the file is
    caught and returned as a "读取错误: ..." string rather than propagated.
    """
    if not HAS_PPTX:
        return None
    try:
        sections = []
        presentation = Presentation(file_path)
        for number, slide in enumerate(presentation.slides, start=1):
            # Shapes without a ``text`` attribute are skipped by the
            # hasattr() check.
            texts = [
                shape.text.strip()
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            if texts:
                sections.append(f"--- 幻灯片 {number} ---\n" + "\n".join(texts))
        return "\n\n".join(sections)
    except Exception as err:
        return f"读取错误: {err}"
|
||
|
||
def main(base_dir=None):
    """Extract text from the RMO overview documents into one text file.

    Each file listed in ``files_to_read`` is looked up under ``base_dir`` and
    handed to its extractor. The collected report is written as UTF-8 to
    ``提取内容_概述总结.txt`` inside ``base_dir``, and the first 50 lines of
    the report are printed to stdout as a preview.

    Args:
        base_dir: Directory that holds the source documents and receives the
            output file. When omitted, the original hard-coded project
            folder is used, so calling ``main()`` behaves as before.
    """
    if base_dir is None:
        base_dir = Path(r"d:\SoftwarePrj\RMO网站\资源中心\关于RMO介绍\RMO概述\概述总结")
    else:
        base_dir = Path(base_dir)

    # (path relative to base_dir, extractor to use for it)
    files_to_read = [
        ("方案制定/一站式临床试验风险管理:RMO.docx", extract_docx),
        ("构建中国药械风险能力责任保障体系(阶段性汇报)【20250505】.pptx", extract_pptx),
        ("华泰_构建生命科学领域责任保险新质生产力202407.pptx", extract_pptx),
        ("临研安生命科学保险技术支持服务介绍 202505.pptx", extract_pptx),
        ("MAH持有人责任风险管理方案-网站建设.pptx", extract_pptx),
    ]
    preview_line_count = 50

    output_lines = [
        "=" * 80,
        "RMO概述总结文件内容提取",
        "=" * 80,
        "",
    ]

    for file_rel_path, extract_func in files_to_read:
        file_path = base_dir / file_rel_path
        if not file_path.exists():
            output_lines.append(f"文件不存在: {file_rel_path}")
            output_lines.append("")
            continue

        output_lines.append(f"\n{'='*80}")
        output_lines.append(f"文件: {file_rel_path}")
        output_lines.append(f"{'='*80}\n")

        content = extract_func(file_path)
        if content is None:
            # The extractors return None only when their library is missing.
            output_lines.append("无法提取内容(可能缺少依赖库)")
        elif content:
            output_lines.append(content)
        else:
            # An empty string means the file was read but held no text; do
            # not blame a missing dependency for that.
            output_lines.append("文件中没有可提取的文本内容")
        output_lines.append("\n")

    # Build the report once; it is used for both the file and the preview.
    output_text = "\n".join(output_lines)

    output_file = base_dir / "提取内容_概述总结.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(output_text)

    print(f"内容已提取到: {output_file}")
    print("\n提取的内容预览:")
    # Entries of output_lines can each hold a whole document, so slice the
    # joined text by line to keep the preview short.
    print("\n".join(output_text.split("\n")[:preview_line_count]))
|
||
|
||
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()
|