cyberandy committed
Commit 7d64ce5 · verified · 1 Parent(s): 82856f6

Update app.py

Files changed (1):
  app.py (+91, −85)
app.py CHANGED
@@ -20,13 +20,15 @@ st.set_page_config(
 # Sidebar
 st.sidebar.image("logo-wordlift.png")
 language_options = {"English", "English - spaCy", "German"}
+# Set default to English to avoid an error on the first run
 selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
 
+# Initialize model and entity set variables
+selected_model_name = None
+selected_entity_set = None
+
 # Based on selected language, configure model, entity set, and citation options
 if selected_language == "German" or selected_language == "English - spaCy":
-    selected_model_name = None
-    selected_entity_set = None
-
     entity_fishing_citation = """
     @misc{entity-fishing,
     title = {entity-fishing},
@@ -36,10 +38,9 @@ if selected_language == "German" or selected_language == "English - spaCy":
     eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
     }
     """
-
     with st.sidebar.expander('Citations'):
         st.markdown(entity_fishing_citation)
-else:
+else:  # English (Refined)
     model_options = ["aida_model", "wikipedia_model_with_numbers"]
     entity_set_options = ["wikidata", "wikipedia"]
 
@@ -54,36 +55,42 @@ else:
     year = "2022"
     }
     """
-
     with st.sidebar.expander('Citations'):
         st.markdown(refined_citation)
 
-@st.cache_resource # 👈 Add the caching decorator
+@st.cache_resource
 def load_model(selected_language, model_name=None, entity_set=None):
+    # Define the public URL for the entity-fishing service
+    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
+
     if selected_language == "German":
         # Load the German-specific model
         nlp_model_de = spacy.load("de_core_news_lg")
-        nlp_model_de.add_pipe("entityfishing")
-
+        # Add the entity-fishing pipe with the server URL configured
+        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
         return nlp_model_de
+
     elif selected_language == "English - spaCy":
         # Load English-specific model
         nlp_model_en = spacy.load("en_core_web_sm")
-        nlp_model_en.add_pipe("entityfishing")
-
-        return nlp_model_en
-    else:
+        # Add the entity-fishing pipe with the server URL configured
+        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
+        return nlp_model_en
+
+    else:  # English (Refined)
         # Load the pretrained model for other languages
         refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model
 
 # Use the cached model
+# We pass the selected options directly to the cached function
+# Streamlit's caching handles re-running this only when the inputs change
 model = load_model(selected_language, selected_model_name, selected_entity_set)
 
 # Helper functions
-def get_wikidata_id(entity_string):
-    entity_list = entity_string.split("=")
-    entity_id = str(entity_list[1])
+def get_wikidata_id(entity_id_string):
+    # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
+    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
     entity_link = "http://www.wikidata.org/entity/" + entity_id
     return {"id": entity_id, "link": entity_link}
 
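The @st.cache_resource decorator in the hunk above memoizes load_model per unique argument combination: a rerun with the same language reuses the cached model, while each new combination triggers exactly one load. A minimal sketch of that behavior; the sleep and the returned string are hypothetical stand-ins for a real model load, not the app's actual models:

import time
import streamlit as st

@st.cache_resource
def load_model(language: str):
    # Pretend this is an expensive spaCy/Refined model load.
    time.sleep(2)
    return f"model-for-{language}"

model = load_model("German")   # ~2s on the first run of the session
model = load_model("German")   # instant: served from the resource cache
st.write(model)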
@@ -92,14 +99,15 @@ def get_entity_data(entity_link):
         # Format the entity_link
         formatted_link = entity_link.replace("http://", "http/")
         response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
+        response.raise_for_status() # Raise an exception for bad status codes
         return response.json()
-    except Exception as e:
-        print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
+    except requests.exceptions.RequestException as e:
+        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
         return None
 
 # Create the form
 with st.form(key='my_form'):
-    text_input = st.text_area(label='Enter a sentence')
+    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
     submit_button = st.form_submit_button(label='Analyze')
 
 # Initialization
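The rewritten get_wikidata_id accepts both ID separators its comment mentions. A self-contained check of the parsing; the sample IDs are illustrative, not taken from real model output:

def get_wikidata_id(entity_id_string):
    # Take the text after the last ':' or '=', whichever is present.
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    return {"id": entity_id, "link": "http://www.wikidata.org/entity/" + entity_id}

assert get_wikidata_id("wikidata:Q64")["id"] == "Q64"
assert get_wikidata_id("wikidata=Q64")["id"] == "Q64"
assert get_wikidata_id("wikidata:Q64")["link"] == "http://www.wikidata.org/entity/Q64"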
@@ -109,92 +117,90 @@ entities_data = {}
 if text_input:
     if selected_language in ["German", "English - spaCy"]:
         doc = model(text_input)
-        entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
-        for entity in entities:
+        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
+        for entity in spacy_entities:
             entity_string, entity_type, wikidata_id, wikidata_url = entity
             if wikidata_url:
-                # Ensure correct format for the German and English model
                 formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                 entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                 entity_data = get_entity_data(formatted_wikidata_url)
 
                 if entity_data is not None:
                     entities_data[entity_string] = entity_data
-    else:
-        entities = model.process_text(text_input)
-
-        for entity in entities:
-            single_entity_list = str(entity).strip('][').replace("\'", "").split(', ')
-            if len(single_entity_list) >= 2 and "wikidata" in single_entity_list[1]:
-                entities_map[single_entity_list[0].strip()] = get_wikidata_id(single_entity_list[1])
-                entity_data = get_entity_data(entities_map[single_entity_list[0].strip()]["link"])
+    else:  # Refined model
+        refined_entities = model.process_text(text_input)
+
+        for entity in refined_entities:
+            # More robustly access entity attributes instead of parsing a string
+            if entity.entity_id and "wikidata" in entity.entity_id:
+                entity_text = entity.text
+                wikidata_info = get_wikidata_id(entity.entity_id)
+                entities_map[entity_text] = wikidata_info
+                entity_data = get_entity_data(wikidata_info["link"])
                 if entity_data is not None:
-                    entities_data[single_entity_list[0].strip()] = entity_data
+                    entities_data[entity_text] = entity_data
 
-combined_entity_info_dictionary = dict([(k, [entities_map[k], entities_data[k] if k in entities_data else None]) for k in entities_map])
+combined_entity_info_dictionary = {
+    k: [entities_map[k], entities_data.get(k)] for k in entities_map
+}
 
 if submit_button:
-    # Prepare a list to hold the final output
+    # A more robust way to build the annotated_text list without using eval()
     final_text = []
-
-    # JSON-LD data
+    current_pos = 0
+
+    # Create a simple list of (text, start, end) for sorting
+    entity_spans = []
+    if selected_language in ["German", "English - spaCy"]:
+        # 'doc' is available from the processing block above
+        for ent in doc.ents:
+            if ent.text in entities_map: # only include linked entities
+                entity_spans.append((ent.text, ent.start_char, ent.end_char))
+    else:
+        # 'refined_entities' is available
+        for ent in refined_entities:
+            if ent.text in entities_map:
+                entity_spans.append((ent.text, ent.span[0], ent.span[1]))
+
+    # Sort entities by their starting position to handle the text correctly
+    sorted_entities = sorted(entity_spans, key=lambda x: x[1])
+
+    for entity_string, start, end in sorted_entities:
+        # Add the text segment before the current entity
+        final_text.append(text_input[current_pos:start])
+
+        # Prepare the annotation for the entity
+        entity_info = entities_map.get(entity_string, {})
+        entity_id = entity_info.get("id", "N/A")
+
+        entity_type_data = entities_data.get(entity_string)
+        entity_type = entity_type_data.get("@type") if entity_type_data else None
+
+        color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
+                 "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
+
+        final_text.append((entity_string, entity_id, color))
+        current_pos = end
+
+    # Add any remaining text after the last entity
+    final_text.append(text_input[current_pos:])
+
+    st.header("Annotated Text")
+    annotated_text(*[item for item in final_text if item]) # Filter out empty strings
+
+    # --- JSON-LD Generation ---
     json_ld_data = {
         "@context": "https://schema.org",
         "@type": "WebPage",
         "mentions": []
     }
-
-    # Replace each entity in the text with its annotated version
-    for entity_string, entity_info in entities_map.items():
-        # Check if the entity has a valid Wikidata link
-        if entity_info["link"] is None or entity_info["link"] == "None":
-            continue # skip this entity
-
-        entity_data = entities_data.get(entity_string, None)
-        entity_type = None
-        if entity_data is not None:
-            entity_type = entity_data.get("@type", None)
-
-        # Use different colors based on the entity's type
-        color = "#8ef" # Default color
-        if entity_type == "Place":
-            color = "#8AC7DB"
-        elif entity_type == "Organization":
-            color = "#ADD8E6"
-        elif entity_type == "Person":
-            color = "#67B7D1"
-        elif entity_type == "Product":
-            color = "#2ea3f2"
-        elif entity_type == "CreativeWork":
-            color = "#00BFFF"
-        elif entity_type == "Event":
-            color = "#1E90FF"
-
-        entity_annotation = (entity_string, entity_info["id"], color)
-        text_input = text_input.replace(entity_string, f'{{{str(entity_annotation)}}}', 1)
-
-        # Add the entity to JSON-LD data
-        entity_json_ld = combined_entity_info_dictionary[entity_string][1]
-        if entity_json_ld and entity_json_ld.get("link") != "None":
-            json_ld_data["mentions"].append(entity_json_ld)
-
-    # Split the modified text_input into a list
-    text_list = text_input.split("{")
-
-    for item in text_list:
-        if "}" in item:
-            item_list = item.split("}")
-            final_text.append(eval(item_list[0]))
-            if len(item_list[1]) > 0:
-                final_text.append(item_list[1])
-        else:
-            final_text.append(item)
-
-    # Pass the final_text to the annotated_text function
-    annotated_text(*final_text)
+    for entity_string, info_list in combined_entity_info_dictionary.items():
+        entity_json_ld = info_list[1] # The data from WordLift API
+        if entity_json_ld:
+            json_ld_data["mentions"].append(entity_json_ld)
 
 with st.expander("See annotations"):
     st.write(combined_entity_info_dictionary)
 
 with st.expander("Here is the final JSON-LD"):
-    st.json(json_ld_data) # Output JSON-LD
+    st.json(json_ld_data)
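The new submit_button block assembles the annotated_text input from character spans instead of round-tripping tuples through the text and eval(). A standalone sketch of that assembly, with hardcoded text and spans as made-up stand-ins for real model output:

text = "Angela Merkel was the first female chancellor of Germany."
entities_map = {"Angela Merkel": {"id": "Q567"}, "Germany": {"id": "Q183"}}
entity_spans = [("Angela Merkel", 0, 13), ("Germany", 49, 56)]

final_text, current_pos = [], 0
for entity_string, start, end in sorted(entity_spans, key=lambda x: x[1]):
    final_text.append(text[current_pos:start])  # plain text before the entity
    final_text.append((entity_string, entities_map[entity_string]["id"], "#8ef"))
    current_pos = end
final_text.append(text[current_pos:])  # trailing text after the last entity

# Drops empty strings, exactly like the call in the diff:
print([item for item in final_text if item])
# [('Angela Merkel', 'Q567', '#8ef'), ' was the first female chancellor of ',
#  ('Germany', 'Q183', '#8ef'), '.']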
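After the mentions loop, the payload rendered by st.json has this general shape. The @context, @type, and mentions keys come from the diff; the two mention objects below are made-up placeholders for WordLift API responses, whose exact fields this diff does not show:

json_ld_data = {
    "@context": "https://schema.org",
    "@type": "WebPage",
    "mentions": [
        # Hypothetical entity payloads; "Person" and "Place" match the
        # @type values the color map above expects.
        {"@type": "Person", "name": "Angela Merkel",
         "@id": "http://www.wikidata.org/entity/Q567"},
        {"@type": "Place", "name": "Germany",
         "@id": "http://www.wikidata.org/entity/Q183"},
    ],
}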