# Spaces: gpaasch/MedCodeMCP
# Build error
# File size: 1,575 Bytes
# 0d38280

import xml.etree.ElementTree as ET
import json
import sys
import os

def _collect_diag_descriptions(root):
    """Return a ``{code: description}`` dict for every usable <diag> under *root*.

    Every <diag> element at any depth is visited. The code is read from its
    direct <name> child and the description from its direct <desc> child.
    Entries whose code or description is missing, empty, or whitespace-only
    are skipped. If the same code appears more than once, the last
    occurrence in document order wins.
    """
    mapping = {}
    for diag in root.iter("diag"):
        # findtext() yields None for a missing child and "" for an empty
        # one; "or ''" folds both cases into an empty string before strip().
        code = (diag.findtext("name") or "").strip()
        description = (diag.findtext("desc") or "").strip()
        if code and description:
            mapping[code] = description
    return mapping


def main(xml_path, out_path="icd_to_description.json"):
    """Convert a tabular XML file into a flat JSON map of code -> description.

    Args:
        xml_path: Path to the tabular XML file to read.
        out_path: Path of the JSON file to write. Defaults to
            "icd_to_description.json" in the current working directory.

    Raises:
        SystemExit: With exit status 1 if *xml_path* is not an existing file
            or does not contain well-formed XML. The reason is printed to
            stderr so it does not mix with normal output.
    """
    if not os.path.isfile(xml_path):
        print(f"ERROR: cannot find tabular XML at '{xml_path}'", file=sys.stderr)
        sys.exit(1)

    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError as err:
        # Report malformed input the same way as a missing file instead of
        # letting a raw traceback escape.
        print(f"ERROR: '{xml_path}' is not well-formed XML: {err}", file=sys.stderr)
        sys.exit(1)

    icd_to_description = _collect_diag_descriptions(root)

    # Write out a flat JSON mapping code -> description. ensure_ascii=False
    # keeps non-ASCII characters readable in the UTF-8 output file.
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(icd_to_description, fp, indent=2, ensure_ascii=False)

    print(f"Wrote {len(icd_to_description)} code entries to {out_path}")


if __name__ == "__main__":
    # Script entry point: exactly one positional argument (the path to the
    # tabular XML file) is expected after the script name.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        main(cli_args[0])
    else:
        print("Usage: python parse_tabular.py <path/to/icd10cm_tabular_2025.xml>")
        sys.exit(1)