Data_Analysis/数据准备过程/模拟数据生成-提示词/generate_simulated_complain...

255 lines
8.7 KiB
Python

# -*- coding: utf-8 -*-
"""按生成质量投诉模拟数据.md生成质量投诉模拟数据。"""
from __future__ import annotations
import random
from datetime import date, timedelta
from pathlib import Path
from openpyxl import Workbook, load_workbook
from openpyxl.utils import get_column_letter
ROOT = Path(__file__).resolve().parent
TEMPLATE = ROOT / "质量投诉数据-表头.xlsx"
PRODUCTS = [
{
"division": "输液治疗",
"bu": "IV Therapy BU",
"name": "一次性使用静脉留置针",
"models": ["20G", "22G", "24G", "Introcan Safety 20G"],
"faults": [
("渗漏", "使用中发现连接处渗液,影响持续输注"),
("连接不牢", "与延长管连接后松动,需重复连接"),
("流速异常", "滴速不稳定,出现间断性停止"),
("包装破损", "开包前发现无菌包装破损"),
],
"cases_max": 3,
},
{
"division": "输液治疗",
"bu": "IV Therapy BU",
"name": "一次性使用输液器",
"models": ["IS-5.0", "IS-7.0", "PF-5.0"],
"faults": [
("堵塞", "输液过程中阻力升高,液体无法正常滴注"),
("渗漏", "滴斗下方接口渗漏,需更换器械"),
("标签不清", "批号标签印刷不清,追溯困难"),
("流速异常", "调节轮无法稳定控制滴速"),
],
"cases_max": 4,
},
{
"division": "透析",
"bu": "Renal Care BU",
"name": "血液透析器",
"models": ["1.3m²", "1.4m²", "HD-180"],
"faults": [
("连接不牢", "血路连接后出现轻度渗血风险"),
("断裂", "外壳边缘出现细微裂纹,已停止使用"),
("流速异常", "跨膜压波动导致透析流程中断"),
("包装破损", "透析器外包装密封不完整"),
],
"cases_max": 2,
},
{
"division": "透析",
"bu": "Renal Care BU",
"name": "血液透析管路",
"models": ["AV-SET-A", "AV-SET-P"],
"faults": [
("渗漏", "动脉端管路连接位出现渗液"),
("堵塞", "泵管段阻力偏高触发报警"),
("连接不牢", "静脉端接头锁定后仍可松脱"),
],
"cases_max": 2,
},
{
"division": "外科",
"bu": "Surgical BU",
"name": "可吸收性外科缝线",
"models": ["3-0", "4-0", "5-0"],
"faults": [
("断裂", "缝合过程中线体提前断裂"),
("标签不清", "规格标签与实际颜色识别困难"),
("包装破损", "单支包装封口松开,未使用"),
],
"cases_max": 2,
},
]
HOSPITALS = [
"上海市第一人民医院",
"浙江大学医学院附属第二医院",
"四川大学华西医院",
"广东省人民医院",
"中南大学湘雅医院",
"山东大学齐鲁医院",
"西安交通大学第一附属医院",
"南京鼓楼医院",
"福建省立医院",
"郑州大学第一附属医院",
]
CONTACTS = ["张医生", "李护士长", "王老师", "陈医生", "赵工", "刘老师", "周护士"]
REPORTERS = ["设备科-赵工", "护理部-陈老师", "质控办-王老师", "临床工程-李工"]
MANUFACTURERS = [
"贝朗医疗(上海)国际贸易有限公司",
"贝朗医疗(苏州)有限公司",
"贝朗爱敦(上海)医疗管理有限公司",
]
AE_NEG_REASONS = ["未造成患者伤害", "仅质量缺陷,无临床后果", "使用前发现缺陷,未接触患者"]
CONCLUSIONS = ["产品缺陷成立", "操作不当", "运输损伤", "未复现", "资料不足"]
PAYMENTS = ["无赔付", "换货", "折让", "退款", "其他协商处理"]
STATUS_POOL = ["新建", "调查中", "待补充", "已关闭"]
def rand_reg_no(rng: random.Random) -> str:
return f"国械注进20{rng.randint(15,26)}{rng.randint(100000,999999)}"
def rand_batch(rng: random.Random) -> str:
return f"{rng.choice('ABCDEFGH')}{rng.randint(1000000,9999999)}"
def rand_phone(rng: random.Random) -> str:
return f"1{rng.choice('3456789')}{rng.randint(100,999)}****{rng.randint(1000,9999)}"
def rand_serial(rng: random.Random, use_na: bool) -> str:
if use_na:
return "N/A"
return f"SN{rng.randint(10**9,10**10-1)}"
def choose_resolution(rng: random.Random, conclusion: str) -> tuple[str, str]:
if conclusion == "产品缺陷成立":
return "Replace", "更换同批次产品并加强到货检验"
if conclusion == "操作不当":
return "Training reinforced", "复测未见异常,建议规范操作培训"
if conclusion == "运输损伤":
return "Logistics improved", "判定为运输环节损伤,已优化包装与交付流程"
if conclusion == "未复现":
return "No defect found", "复测未见异常,建议持续观察后续批次"
return "Need more info", "现有证据不足,待补充样品及记录"
def generate_rows(n: int, seed: int) -> list[list]:
rng = random.Random(seed)
rows: list[list] = []
today = date.today()
start = today - timedelta(days=980)
for i in range(1, n + 1):
p = rng.choice(PRODUCTS)
fault, detail = rng.choice(p["faults"])
hospital = rng.choice(HOSPITALS)
c3_date = start + timedelta(days=rng.randint(0, (today - start).days))
c3_month = c3_date.strftime("%Y-%m")
is_ae = "" if rng.random() < 0.28 else ""
ae_no_reason = "" if is_ae == "" else rng.choice(AE_NEG_REASONS)
status = rng.choices(STATUS_POOL, weights=[0.15, 0.25, 0.1, 0.5], k=1)[0]
report_done = None
closed_date = None
if status in ("调查中", "已关闭", "待补充"):
report_done = c3_date + timedelta(days=rng.randint(3, 45))
if status == "已关闭":
base = report_done if report_done else c3_date
closed_date = base + timedelta(days=rng.randint(1, 30))
bad_qty = rng.randint(1, 20)
sample_return = "" if rng.random() < 0.78 else ""
if sample_return == "":
qa_qty = 0 if rng.random() < 0.95 else rng.randint(0, bad_qty)
else:
qa_qty = rng.randint(1, bad_qty)
factory_qty = rng.randint(0, qa_qty)
case_count = 1 if rng.random() < 0.85 else rng.randint(2, p["cases_max"])
conclusion = rng.choice(CONCLUSIONS)
payment = rng.choice(PAYMENTS)
opinion_en, opinion_cn = choose_resolution(rng, conclusion)
# make payment more coherent
if conclusion in ("未复现", "资料不足", "操作不当"):
payment = rng.choices(["无赔付", "其他协商处理", "折让"], weights=[0.7, 0.2, 0.1], k=1)[0]
elif conclusion == "产品缺陷成立":
payment = rng.choices(["换货", "退款", "折让", "无赔付"], weights=[0.5, 0.2, 0.2, 0.1], k=1)[0]
row = [
f"C3-{today:%Y}-{i:06d}",
rng.choice(p["models"]),
rand_batch(rng),
rand_serial(rng, use_na=("输液" in p["division"] and rng.random() < 0.6)),
rng.choice(MANUFACTURERS),
rand_reg_no(rng),
p["name"],
hospital,
rng.choice(CONTACTS),
rand_phone(rng),
fault,
detail,
rng.choice(REPORTERS),
p["bu"],
c3_date,
c3_month,
is_ae,
ae_no_reason,
bad_qty,
qa_qty,
factory_qty,
report_done,
opinion_en,
opinion_cn,
conclusion,
payment,
closed_date,
status,
sample_return,
p["division"],
case_count,
]
rows.append(row)
return rows
def template_headers() -> list[str]:
wb = load_workbook(TEMPLATE, read_only=True, data_only=True)
ws = wb.active
headers = [ws.cell(1, j).value for j in range(1, ws.max_column + 1)]
wb.close()
return headers
def main() -> None:
headers = template_headers()
today = date.today()
out_path = ROOT / f"质量投诉数据-模拟1000条-{today:%Y%m%d}.xlsx"
rows = generate_rows(1000, int(today.strftime("%Y%m%d")))
wb = Workbook()
ws = wb.active
ws.title = "complaint form"
ws.append(headers)
for r in rows:
ws.append(r)
# date columns
for r in range(2, 1002):
for col in ("O", "V", "AA"):
if ws[f"{col}{r}"].value:
ws[f"{col}{r}"].number_format = "YYYY-MM-DD"
for i in range(1, len(headers) + 1):
ws.column_dimensions[get_column_letter(i)].width = 16
wb.save(out_path)
print(f"Written: {out_path}")
if __name__ == "__main__":
main()