# Data_Analysis/scripts/convert_数据_to_mock_json.py
# (201 lines, 8.0 KiB, Python)
#
# Notice carried over from the repository viewer: this file contains Unicode
# characters that might be confused with other characters. If that is
# intentional, the warning can be safely ignored.

"""
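Convert the three Excel workbooks under the 「数据」 directory (laid out as described in
#数据与表结构.md) into analytics-demo-web/public/mock/*.json.

Derived fields (used when the source sheet has no such column):
- AE occurrenceDate / registrationDate: back-dated from the review date so that
  occurrence < registration < review (demo only).
- AE province: mapped from the admission sheet, using the most frequent province per hospital name.
- AE SAE four booleans: decided heuristically from keywords in 「伤害表现」 (injury expression).
- Admission productName: MaterialDesc is mapped onto the union of AE and complaint product names
  (longest-first containment match) so it can be joined with complaint rates.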
"""
from __future__ import annotations
import hashlib
import json
from pathlib import Path
import pandas as pd
# Project root: the directory one level above the folder that holds this script.
ROOT = Path(__file__).resolve().parents[1]
# Folder the source Excel workbooks are read from.
DATA_DIR = ROOT.joinpath("数据")
# Folder the generated mock JSON files are written to.
OUT_DIR = ROOT.joinpath("analytics-demo-web", "public", "mock")
def _d(v) -> pd.Timestamp | pd.NaTType:
    """Parse a cell value into a timestamp.

    Missing values (anything ``pd.isna`` reports as missing) come back as
    ``pd.NaT``; everything else is handed to ``pd.to_datetime``.
    """
    return pd.NaT if pd.isna(v) else pd.to_datetime(v)
def _fmt(d: pd.Timestamp | pd.NaTType) -> str:
    """Render a timestamp as ``YYYY-MM-DD``; a missing value becomes ``""``."""
    return "" if pd.isna(d) else d.strftime("%Y-%m-%d")
def _stable_int(s: str) -> int:
    """Map a string to a reproducible 32-bit unsigned integer.

    The value is the first four bytes of the MD5 digest of the UTF-8 encoded
    string, read big-endian, so the same string always yields the same number.
    """
    digest = hashlib.md5(s.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "big")
def infer_sae_flags(injury_text: str) -> dict[str, bool]:
    """Guess the four SAE (serious adverse event) flags from injury text.

    This is a plain keyword heuristic: each flag is set when one of its
    keywords occurs anywhere in the text. A falsy input (``None`` or ``""``)
    yields all-``False`` flags.

    Returns:
        A dict with the keys ``saeDeath``, ``saeLifeThreatening``,
        ``saeDisability`` and ``saeHospitalization``.
    """
    text = injury_text if injury_text else ""

    def mentions(*keywords: str) -> bool:
        # True when at least one keyword is a substring of the text.
        return any(keyword in text for keyword in keywords)

    return {
        # "非死亡" contains "死亡", so it has to be ruled out explicitly.
        "saeDeath": mentions("死亡") and not mentions("非死亡"),
        "saeLifeThreatening": mentions("危及生命", "生命威胁", "心跳骤停"),
        "saeDisability": mentions("残疾", "功能障碍"),
        "saeHospitalization": mentions("住院", "延长住院", "住院时间延长"),
    }
def map_material_to_product(material_desc: str, canonical_products: list[str]) -> str:
    """Map a free-text material description onto a canonical product name.

    Args:
        material_desc: Raw description; anything that is not a non-blank
            string maps to ``""``.
        canonical_products: Known product names, in any order. Empty entries
            are ignored.

    Returns:
        The longest canonical name contained in the description. When none is
        contained, the first space-separated token of the description (or the
        whole description if it has no space).
    """
    if not isinstance(material_desc, str):
        return ""
    desc = material_desc.strip()
    if not desc:
        return ""
    # Pick the longest contained name instead of the first one found, so the
    # result no longer depends on the caller pre-sorting the list. max() keeps
    # the earliest of equally long candidates, which matches a first-match
    # scan over a list that is already sorted longest-first.
    best = max((p for p in canonical_products if p and p in desc), key=len, default="")
    if best:
        return best
    return desc.split()[0] if " " in desc else desc
# Placeholder province for AE units that cannot be matched to any hospital.
_UNMAPPED_PROVINCE = "(未映射)"

# The admission sheet covers only some hospitals; the remaining AE units get a
# hand-assigned province (used only for the demo map / per-province totals).
_PROVINCE_OVERRIDE: dict[str, str] = {
    "中国医科大学附属第一医院": "辽宁省",
    "华中科技大学同济医学院附属协和医院": "湖北省",
    "昆明医科大学第一附属医院": "云南省",
    "天津市肿瘤医院": "天津市",
    "重庆医科大学附属第一医院": "重庆市",
}

# Cell texts (lower-cased) read as "yes".
# NOTE(review): the values of the 「是否不良事件」 column are not visible from this
# script; "是" is presumed to be the affirmative marker -- confirm against the workbook.
_YES_TOKENS = frozenset({"是", "y", "yes", "true", "1", "1.0"})


def _text(v) -> str:
    """Return a cell value as a stripped string; missing cells become ``""``.

    ``str(v or "")`` is not enough for pandas rows: an empty cell arrives as
    NaN, which is truthy and stringifies to the literal ``"nan"``.
    """
    if isinstance(v, str):
        return v.strip()
    if v is None or pd.isna(v):
        return ""
    return str(v).strip()


def _number(v) -> float:
    """Return a numeric cell as ``float``; missing or empty cells become ``0.0``.

    NaN must not reach ``json.dumps``: it would be written as the bare token
    ``NaN``, which strict JSON parsers reject.
    """
    if pd.isna(v) or not v:
        return 0.0
    return float(v)


def _yn_to_bool(v) -> bool:
    """Interpret a yes/no cell; blank, missing or unrecognised values are ``False``."""
    return _text(v).lower() in _YES_TOKENS


def _hospital_provinces(adm_df: pd.DataFrame) -> dict[str, str]:
    """Map each hospital in the admission sheet to its most frequent province."""

    def most_common(provinces: pd.Series) -> str:
        modes = provinces.mode()
        # mode() ignores NaN, so it is empty when every province cell is blank;
        # report that as "" rather than leaking a NaN into the output.
        return _text(modes.iloc[0]) if len(modes) else ""

    return adm_df.groupby("HospitalName")["Province"].agg(most_common).to_dict()


def _province_resolver(hosp_prov: dict[str, str]):
    """Build the ``unit name -> province`` lookup used for AE rows.

    Lookup order: manual overrides, exact hospital name, then the first
    hospital whose name contains the unit name or is contained in it.
    """
    # Longest names first, so the substring fallback tries the most specific
    # hospital name before shorter ones.
    hospital_keys = sorted((h for h in hosp_prov if h), key=len, reverse=True)

    def resolve_province(unit: str) -> str:
        if not unit:
            # "" is a substring of every hospital name; without this guard a
            # blank unit would inherit the first hospital's province.
            return _UNMAPPED_PROVINCE
        if unit in _PROVINCE_OVERRIDE:
            return _PROVINCE_OVERRIDE[unit]
        if unit in hosp_prov:
            return hosp_prov[unit]
        for hospital in hospital_keys:
            if hospital in unit or unit in hospital:
                return hosp_prov[hospital]
        return _UNMAPPED_PROVINCE

    return resolve_province


def _ae_rows(ae_df: pd.DataFrame, resolve_province) -> list[dict]:
    """Build the adverse-event records; rows without a review date are skipped."""
    rows: list[dict] = []
    for _, row in ae_df.iterrows():
        review = _d(row.get("审核日期"))
        if pd.isna(review):
            continue
        code = _text(row.get("报告编码"))
        # The source has no occurrence/registration dates. Derive them from the
        # report code so reruns give identical output: registration is 3-8 days
        # before review, occurrence 1-12 days before registration.
        seed = _stable_int(code) % 10000
        reg = review - pd.Timedelta(days=3 + seed % 6)
        occ = reg - pd.Timedelta(days=1 + seed % 12)
        unit = _text(row.get("单位名称"))
        injury = _text(row.get("伤害表现"))
        rows.append(
            {
                "reportCode": code,
                "occurrenceDate": _fmt(occ),
                "registrationDate": _fmt(reg),
                "unitName": unit,
                "businessUnit": _text(row.get("事业线")),
                "productName": _text(row.get("产品名称")),
                "registrationNo": _text(row.get("注册证编号/曾用注册证编号")),
                "model": _text(row.get("型号")),
                "batchNo": _text(row.get("产品批号")),
                "injuryExpression": injury,
                "deviceFailure": _text(row.get("器械故障表现")),
                "reviewDate": _fmt(review),
                # A matched hospital can still have a blank province.
                "province": resolve_province(unit) or _UNMAPPED_PROVINCE,
                **infer_sae_flags(injury),
            }
        )
    return rows


def _admission_rows(adm_df: pd.DataFrame, canonical_products: list[str]) -> list[dict]:
    """Build the admission-volume records, one per sheet row."""
    rows: list[dict] = []
    for _, row in adm_df.iterrows():
        rows.append(
            {
                "year": int(row["Year"]),
                "month": int(row["Month"]),
                "hospitalName": _text(row.get("HospitalName")),
                "dealerName": _text(row.get("DealerName")),
                "province": _text(row.get("Province")),
                "bu": _text(row.get("BU")),
                "productName": map_material_to_product(
                    _text(row.get("MaterialDesc")), canonical_products
                ),
                "cyQty": _number(row.get("CY Qty")),
                "lyQty": _number(row.get("LY Qty")),
                # Growth columns are fractions in the sheet; emit percentages.
                "growthQtyPct": round(_number(row.get("Growth% Qty")) * 100, 4),
                "cyAmt": _number(row.get("CY Amt")),
                "growthAmtPct": round(_number(row.get("Growth% Amt")) * 100, 4),
            }
        )
    return rows


def _complaint_rows(cmp_df: pd.DataFrame) -> list[dict]:
    """Build the complaint records; rows without a registration date are skipped."""
    rows: list[dict] = []
    for _, row in cmp_df.iterrows():
        reg = _d(row.get("C3登记日期"))
        if pd.isna(reg):
            continue
        close = _d(row.get("关闭日期"))
        if pd.isna(close):
            # No close date: fall back to the survey-report date, then to the
            # registration date, so closeDate is never empty.
            survey = _d(row.get("调查报告完成日期"))
            close = reg if pd.isna(survey) else survey
        rows.append(
            {
                "c3Code": _text(row.get("C3编号")),
                "model": _text(row.get("型号")),
                "batchNo": _text(row.get("批号")),
                "registrationNo": _text(row.get("注册证号")),
                "productName": _text(row.get("产品名称")),
                "hospitalName": _text(row.get("医院名称")),
                "faultType": _text(row.get("故障类型")),
                "registerDate": _fmt(reg),
                "isAe": _yn_to_bool(row.get("是否不良事件")),
                "conclusion": _text(row.get("调查结论(处理结果)")),
                "compensation": _text(row.get("赔付结论")),
                "closeDate": _fmt(close),
            }
        )
    return rows


def _write_json(path: Path, rows: list[dict]) -> None:
    """Write the records to ``path`` as indented UTF-8 JSON, keeping non-ASCII text readable."""
    path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")


def main() -> None:
    """Convert the three source workbooks into the mock JSON files.

    Reads the first sheet of the adverse-event, admission and complaint
    workbooks from ``DATA_DIR`` and writes ``ae.json``, ``admission.json`` and
    ``complaint.json`` into ``OUT_DIR`` (created if needed). Missing cells are
    emitted as ``""`` (text) or ``0.0`` (numbers), never as "nan"/NaN.
    """
    ae_df = pd.read_excel(DATA_DIR / "不良事件数据-模拟1000条-20260414.xlsx", sheet_name=0)
    adm_df = pd.read_excel(DATA_DIR / "入院量数据-模拟1000条-20260414.xlsx", sheet_name=0)
    cmp_df = pd.read_excel(DATA_DIR / "质量投诉数据-模拟1000条-20260414.xlsx", sheet_name=0)
    # Drop line breaks and outer whitespace from complaint headers so the
    # column lookups in _complaint_rows match.
    cmp_df.columns = [str(c).replace("\n", "").strip() for c in cmp_df.columns]

    resolve_province = _province_resolver(_hospital_provinces(adm_df))

    # Canonical product names: union of AE and complaint names, longest first.
    ae_products = set(ae_df["产品名称"].dropna().astype(str).unique())
    cmp_products = set(cmp_df["产品名称"].dropna().astype(str).unique())
    canonical_products = sorted(ae_products | cmp_products, key=len, reverse=True)

    ae_rows = _ae_rows(ae_df, resolve_province)
    adm_rows = _admission_rows(adm_df, canonical_products)
    cmp_rows = _complaint_rows(cmp_df)

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    outputs = {
        "ae.json": ae_rows,
        "admission.json": adm_rows,
        "complaint.json": cmp_rows,
    }
    for filename, rows in outputs.items():
        _write_json(OUT_DIR / filename, rows)
    print("Wrote", len(ae_rows), "ae,", len(adm_rows), "admission,", len(cmp_rows), "complaint ->", OUT_DIR)
# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()