gpaasch committed on
Commit
357828f
·
1 Parent(s): 36b0f3d

Hugging Face embeddings for LlamaIndex had to be installed as a separate package (llama-index-embeddings-huggingface): https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings/

Browse files
data/processed/icd_to_description.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -4,6 +4,7 @@ gradio
4
 
5
  # core Llama-Index + HF model support
6
  llama-index>=0.9.0 # Specify minimum version
 
7
  openai
8
  transformers
9
  torch
@@ -12,6 +13,7 @@ accelerate
12
  # optional extras
13
  langchain
14
  langchain-community
 
15
 
16
  # system requirement for audio I/O (ffmpeg must be installed):
17
  # • Debian/Ubuntu: sudo apt install ffmpeg
 
4
 
5
  # core Llama-Index + HF model support
6
  llama-index>=0.9.0 # Specify minimum version
7
+ llama-index-embeddings-huggingface
8
  openai
9
  transformers
10
  torch
 
13
  # optional extras
14
  langchain
15
  langchain-community
16
+ sentence-transformers>=2.2.0
17
 
18
  # system requirement for audio I/O (ffmpeg must be installed):
19
  # • Debian/Ubuntu: sudo apt install ffmpeg
src/__init__.py ADDED
File without changes
src/app.py CHANGED
@@ -1,10 +1,12 @@
1
  import os
2
  import gradio as gr
3
- from llama_index import HuggingFaceLLMPredictor
4
- from src.parse_tabular import symptom_index
 
5
 
6
- # --- LlamaIndex utils import ---
7
- from utils.llama_index_utils import get_llm_predictor, build_index, query_symptoms
 
8
 
9
  # --- System prompt ---
10
  SYSTEM_PROMPT = """
 
1
  import os
2
  import gradio as gr
3
+ from llama_index.core import VectorStoreIndex
4
+ from llama_index.llms import HuggingFaceLLMPredictor
5
+ from llama_index.readers import SimpleDirectoryReader
6
 
7
+ # Relative imports should be explicit
8
+ from parse_tabular import symptom_index # Changed from relative import
9
+ from ..utils.llama_index_utils import get_llm_predictor, build_index, query_symptoms
10
 
11
  # --- System prompt ---
12
  SYSTEM_PROMPT = """
src/parse_tabular.py CHANGED
@@ -2,8 +2,19 @@ import xml.etree.ElementTree as ET
2
  import json
3
  import sys
4
  import os
 
 
 
 
 
 
 
 
5
 
6
- def main(xml_path):
 
 
 
7
  if not os.path.isfile(xml_path):
8
  print(f"ERROR: cannot find tabular XML at '{xml_path}'")
9
  sys.exit(1)
@@ -34,15 +45,41 @@ def main(xml_path):
34
  icd_to_description[code] = description
35
 
36
  # Write out a flat JSON mapping code → description
37
- out_path = "icd_to_description.json"
38
  with open(out_path, "w", encoding="utf-8") as fp:
39
  json.dump(icd_to_description, fp, indent=2, ensure_ascii=False)
40
 
41
  print(f"Wrote {len(icd_to_description)} code entries to {out_path}")
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
  if __name__ == "__main__":
45
- if len(sys.argv) != 2:
46
- print("Usage: python parse_tabular.py <path/to/icd10cm_tabular_2025.xml>")
47
- sys.exit(1)
48
- main(sys.argv[1])
 
 
2
  import json
3
  import sys
4
  import os
5
+ from llama_index.core import VectorStoreIndex, Document, Settings
6
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
7
+ # Update path constants
8
+ BASE_DIR = os.path.dirname(os.path.dirname(__file__))
9
+ DATA_DIR = os.path.join(BASE_DIR, "data")
10
+ ICD_DIR = os.path.join(DATA_DIR, "icd10cm_tabular_2025")
11
+ DEFAULT_XML_PATH = os.path.join(ICD_DIR, "icd10cm_tabular_2025.xml")
12
+ PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
13
 
14
+ def main(xml_path=DEFAULT_XML_PATH):
15
+ # Create processed directory if it doesn't exist
16
+ os.makedirs(PROCESSED_DIR, exist_ok=True)
17
+
18
  if not os.path.isfile(xml_path):
19
  print(f"ERROR: cannot find tabular XML at '{xml_path}'")
20
  sys.exit(1)
 
45
  icd_to_description[code] = description
46
 
47
  # Write out a flat JSON mapping code → description
48
+ out_path = os.path.join(PROCESSED_DIR, "icd_to_description.json")
49
  with open(out_path, "w", encoding="utf-8") as fp:
50
  json.dump(icd_to_description, fp, indent=2, ensure_ascii=False)
51
 
52
  print(f"Wrote {len(icd_to_description)} code entries to {out_path}")
53
 
54
+ def create_symptom_index():
55
+ # Configure to use local HuggingFace embeddings
56
+ Settings.embed_model = HuggingFaceEmbedding(
57
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
58
+ )
59
+
60
+ # Load and process data
61
+ json_path = os.path.join(PROCESSED_DIR, "icd_to_description.json")
62
+ with open(json_path, "r") as f:
63
+ icd_data = json.load(f)
64
+
65
+ # Convert to Document objects
66
+ documents = [
67
+ Document(
68
+ text=f"ICD-10 Code {code}: {desc}",
69
+ metadata={"code": code}
70
+ )
71
+ for code, desc in icd_data.items()
72
+ ]
73
+
74
+ # Create and return the index
75
+ return VectorStoreIndex.from_documents(documents)
76
+
77
+ # Move this outside the main() function
78
+ symptom_index = None
79
 
80
  if __name__ == "__main__":
81
+ if len(sys.argv) > 1:
82
+ main(sys.argv[1])
83
+ else:
84
+ main() # Use default path
85
+ symptom_index = create_symptom_index()
src/symptom_to_icd.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "cough": ["R05"],
3
+ "fever": ["R50.9"],
4
+ "headache": ["R51"]
5
+ }
utils/__init__.py ADDED
File without changes