cyberandy committed on
Commit
e8957f4
·
verified ·
1 Parent(s): 87c144f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -123
app.py CHANGED
@@ -20,15 +20,13 @@ st.set_page_config(
20
  # Sidebar
21
  st.sidebar.image("logo-wordlift.png")
22
  language_options = {"English", "English - spaCy", "German"}
23
- # Set default to English to avoid an error on the first run
24
  selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
25
 
26
- # Initialize model and entity set variables
27
- selected_model_name = None
28
- selected_entity_set = None
29
-
30
  # Based on selected language, configure model, entity set, and citation options
31
  if selected_language == "German" or selected_language == "English - spaCy":
 
 
 
32
  entity_fishing_citation = """
33
  @misc{entity-fishing,
34
  title = {entity-fishing},
@@ -38,12 +36,14 @@ if selected_language == "German" or selected_language == "English - spaCy":
38
  eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
39
  }
40
  """
 
41
  with st.sidebar.expander('Citations'):
42
  st.markdown(entity_fishing_citation)
43
- else: # English (Refined)
 
44
  model_options = ["aida_model", "wikipedia_model_with_numbers"]
45
  entity_set_options = ["wikidata", "wikipedia"]
46
-
47
  selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
48
  selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
49
 
@@ -55,152 +55,196 @@ else: # English (Refined)
55
  year = "2022"
56
  }
57
  """
 
58
  with st.sidebar.expander('Citations'):
59
  st.markdown(refined_citation)
60
 
61
- @st.cache_resource
62
  def load_model(selected_language, model_name=None, entity_set=None):
63
- # Define the public URL for the entity-fishing service
64
- entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
65
-
66
- if selected_language == "German":
67
- # Load the German-specific model
68
- nlp_model_de = spacy.load("de_core_news_lg")
69
- # Add the entity-fishing pipe with the server URL configured
70
- nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
71
- return nlp_model_de
72
-
73
- elif selected_language == "English - spaCy":
74
- # Load English-specific model
75
- nlp_model_en = spacy.load("en_core_web_sm")
76
- # Add the entity-fishing pipe with the server URL configured
77
- nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
78
- return nlp_model_en
79
-
80
- else: # English (Refined)
81
- # Load the pretrained model for other languages
82
- refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
83
- return refined_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  # Use the cached model
86
- # We pass the selected options directly to the cached function
87
- # Streamlit's caching handles re-running this only when the inputs change
88
  model = load_model(selected_language, selected_model_name, selected_entity_set)
89
 
90
  # Helper functions
91
- def get_wikidata_id(entity_id_string):
92
- # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
93
- entity_id = entity_id_string.split(":")[-1].split("=")[-1]
94
  entity_link = "http://www.wikidata.org/entity/" + entity_id
95
  return {"id": entity_id, "link": entity_link}
96
-
97
  def get_entity_data(entity_link):
98
  try:
99
  # Format the entity_link
100
  formatted_link = entity_link.replace("http://", "http/")
101
  response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
102
- response.raise_for_status() # Raise an exception for bad status codes
103
  return response.json()
104
- except requests.exceptions.RequestException as e:
105
- st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
106
  return None
107
-
108
  # Create the form
109
  with st.form(key='my_form'):
110
- text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
111
  submit_button = st.form_submit_button(label='Analyze')
112
 
113
  # Initialization
114
  entities_map = {}
115
  entities_data = {}
116
 
117
- if text_input:
118
- if selected_language in ["German", "English - spaCy"]:
119
- doc = model(text_input)
120
- spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
121
- for entity in spacy_entities:
122
- entity_string, entity_type, wikidata_id, wikidata_url = entity
123
- if wikidata_url:
124
- formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
125
- entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
126
- entity_data = get_entity_data(formatted_wikidata_url)
127
-
128
- if entity_data is not None:
129
- entities_data[entity_string] = entity_data
130
- else: # Refined model
131
- refined_entities = model.process_text(text_input)
132
-
133
- for entity in refined_entities:
134
- # More robustly access entity attributes instead of parsing a string
135
- if entity.entity_id and "wikidata" in entity.entity_id:
136
- entity_text = entity.text
137
- wikidata_info = get_wikidata_id(entity.entity_id)
138
- entities_map[entity_text] = wikidata_info
139
- entity_data = get_entity_data(wikidata_info["link"])
140
- if entity_data is not None:
141
- entities_data[entity_text] = entity_data
142
-
143
- combined_entity_info_dictionary = {
144
- k: [entities_map[k], entities_data.get(k)] for k in entities_map
145
- }
146
-
147
- if submit_button:
148
- # A more robust way to build the annotated_text list without using eval()
149
- final_text = []
150
- current_pos = 0
151
-
152
- # Create a simple list of (text, start, end) for sorting
153
- entity_spans = []
154
  if selected_language in ["German", "English - spaCy"]:
155
- # 'doc' is available from the processing block above
 
 
 
 
156
  for ent in doc.ents:
157
- if ent.text in entities_map: # only include linked entities
158
- entity_spans.append((ent.text, ent.start_char, ent.end_char))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  else:
160
- # 'refined_entities' is available
161
- for ent in refined_entities:
162
- if ent.text in entities_map:
163
- entity_spans.append((ent.text, ent.span[0], ent.span[1]))
164
-
165
- # Sort entities by their starting position to handle the text correctly
166
- sorted_entities = sorted(entity_spans, key=lambda x: x[1])
167
-
168
- for entity_string, start, end in sorted_entities:
169
- # Add the text segment before the current entity
170
- final_text.append(text_input[current_pos:start])
171
-
172
- # Prepare the annotation for the entity
173
- entity_info = entities_map.get(entity_string, {})
174
- entity_id = entity_info.get("id", "N/A")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- entity_type_data = entities_data.get(entity_string)
177
- entity_type = entity_type_data.get("@type") if entity_type_data else None
178
-
179
- color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
180
- "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
181
 
182
- final_text.append((entity_string, entity_id, color))
183
- current_pos = end
184
-
185
- # Add any remaining text after the last entity
186
- final_text.append(text_input[current_pos:])
 
 
 
 
 
 
 
 
 
187
 
188
- st.header("Annotated Text")
189
- annotated_text(*[item for item in final_text if item]) # Filter out empty strings
190
-
191
- # --- JSON-LD Generation ---
192
- json_ld_data = {
193
- "@context": "https://schema.org",
194
- "@type": "WebPage",
195
- "mentions": []
196
- }
197
- for entity_string, info_list in combined_entity_info_dictionary.items():
198
- entity_json_ld = info_list[1] # The data from WordLift API
199
- if entity_json_ld:
200
- json_ld_data["mentions"].append(entity_json_ld)
201
-
202
- with st.expander("See annotations"):
203
- st.write(combined_entity_info_dictionary)
204
 
205
- with st.expander("Here is the final JSON-LD"):
206
- st.json(json_ld_data)
 
 
 
20
  # Sidebar
21
  st.sidebar.image("logo-wordlift.png")
22
  language_options = {"English", "English - spaCy", "German"}
 
23
  selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
24
 
 
 
 
 
25
  # Based on selected language, configure model, entity set, and citation options
26
  if selected_language == "German" or selected_language == "English - spaCy":
27
+ selected_model_name = None
28
+ selected_entity_set = None
29
+
30
  entity_fishing_citation = """
31
  @misc{entity-fishing,
32
  title = {entity-fishing},
 
36
  eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
37
  }
38
  """
39
+
40
  with st.sidebar.expander('Citations'):
41
  st.markdown(entity_fishing_citation)
42
+
43
+ else:
44
  model_options = ["aida_model", "wikipedia_model_with_numbers"]
45
  entity_set_options = ["wikidata", "wikipedia"]
46
+
47
  selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
48
  selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
49
 
 
55
  year = "2022"
56
  }
57
  """
58
+
59
  with st.sidebar.expander('Citations'):
60
  st.markdown(refined_citation)
61
 
62
+ @st.cache_resource # 👈 Add the caching decorator
63
  def load_model(selected_language, model_name=None, entity_set=None):
64
+ try:
65
+ if selected_language == "German":
66
+ # Load the German-specific model
67
+ nlp_model_de = spacy.load("de_core_news_lg")
68
+
69
+ # Check if entityfishing component is available
70
+ if "entityfishing" not in nlp_model_de.pipe_names:
71
+ try:
72
+ nlp_model_de.add_pipe("entityfishing")
73
+ except Exception as e:
74
+ st.error(f"Error adding entityfishing component: {e}")
75
+ st.error("Please make sure entity-fishing is properly installed and configured.")
76
+ return None
77
+
78
+ return nlp_model_de
79
+
80
+ elif selected_language == "English - spaCy":
81
+ # Load English-specific model
82
+ nlp_model_en = spacy.load("en_core_web_sm")
83
+
84
+ # Check if entityfishing component is available
85
+ if "entityfishing" not in nlp_model_en.pipe_names:
86
+ try:
87
+ nlp_model_en.add_pipe("entityfishing")
88
+ except Exception as e:
89
+ st.error(f"Error adding entityfishing component: {e}")
90
+ st.error("Please make sure entity-fishing is properly installed and configured.")
91
+ return None
92
+
93
+ return nlp_model_en
94
+ else:
95
+ # Load the pretrained model for other languages
96
+ refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
97
+ return refined_model
98
+ except Exception as e:
99
+ st.error(f"Error loading model: {e}")
100
+ return None
101
 
102
  # Use the cached model
 
 
103
  model = load_model(selected_language, selected_model_name, selected_entity_set)
104
 
105
  # Helper functions
106
+ def get_wikidata_id(entity_string):
107
+ entity_list = entity_string.split("=")
108
+ entity_id = str(entity_list[1])
109
  entity_link = "http://www.wikidata.org/entity/" + entity_id
110
  return {"id": entity_id, "link": entity_link}
111
+
112
  def get_entity_data(entity_link):
113
  try:
114
  # Format the entity_link
115
  formatted_link = entity_link.replace("http://", "http/")
116
  response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
 
117
  return response.json()
118
+ except Exception as e:
119
+ print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
120
  return None
121
+
122
  # Create the form
123
  with st.form(key='my_form'):
124
+ text_input = st.text_area(label='Enter a sentence')
125
  submit_button = st.form_submit_button(label='Analyze')
126
 
127
  # Initialization
128
  entities_map = {}
129
  entities_data = {}
130
 
131
+ if text_input and model is not None:
132
+ try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  if selected_language in ["German", "English - spaCy"]:
134
+ # Process the text with error handling
135
+ doc = model(text_input)
136
+
137
+ # Fixed the syntax error: ent._.kb_qid instead of ent..kb_qid
138
+ entities = []
139
  for ent in doc.ents:
140
+ try:
141
+ # Check if the custom attributes exist
142
+ kb_qid = getattr(ent._, 'kb_qid', None) if hasattr(ent, '_') else None
143
+ url_wikidata = getattr(ent._, 'url_wikidata', None) if hasattr(ent, '_') else None
144
+ entities.append((ent.text, ent.label_, kb_qid, url_wikidata))
145
+ except AttributeError as e:
146
+ # If the entityfishing attributes don't exist, use basic entity info
147
+ entities.append((ent.text, ent.label_, None, None))
148
+
149
+ for entity in entities:
150
+ entity_string, entity_type, wikidata_id, wikidata_url = entity
151
+ if wikidata_url:
152
+ # Ensure correct format for the German and English model
153
+ formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
154
+ entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
155
+ entity_data = get_entity_data(formatted_wikidata_url)
156
+
157
+ if entity_data is not None:
158
+ entities_data[entity_string] = entity_data
159
  else:
160
+ entities = model.process_text(text_input)
161
+
162
+ for entity in entities:
163
+ single_entity_list = str(entity).strip('][').replace("\'", "").split(', ')
164
+ if len(single_entity_list) >= 2 and "wikidata" in single_entity_list[1]:
165
+ entities_map[single_entity_list[0].strip()] = get_wikidata_id(single_entity_list[1])
166
+ entity_data = get_entity_data(entities_map[single_entity_list[0].strip()]["link"])
167
+ if entity_data is not None:
168
+ entities_data[single_entity_list[0].strip()] = entity_data
169
+
170
+ except Exception as e:
171
+ st.error(f"Error processing text: {e}")
172
+ if "entityfishing" in str(e).lower():
173
+ st.error("This appears to be an entity-fishing related error. Please ensure:")
174
+ st.error("1. Entity-fishing service is running")
175
+ st.error("2. spacyfishing package is properly installed")
176
+ st.error("3. Network connectivity to entity-fishing service")
177
+
178
+ # Combine entity information
179
+ combined_entity_info_dictionary = dict([(k, [entities_map[k], entities_data[k] if k in entities_data else None]) for k in entities_map])
180
+
181
+ if submit_button and entities_map:
182
+ # Prepare a list to hold the final output
183
+ final_text = []
184
+
185
+ # JSON-LD data
186
+ json_ld_data = {
187
+ "@context": "https://schema.org",
188
+ "@type": "WebPage",
189
+ "mentions": []
190
+ }
191
+
192
+ # Replace each entity in the text with its annotated version
193
+ for entity_string, entity_info in entities_map.items():
194
+ # Check if the entity has a valid Wikidata link
195
+ if entity_info["link"] is None or entity_info["link"] == "None":
196
+ continue # skip this entity
197
+
198
+ entity_data = entities_data.get(entity_string, None)
199
+ entity_type = None
200
+ if entity_data is not None:
201
+ entity_type = entity_data.get("@type", None)
202
+
203
+ # Use different colors based on the entity's type
204
+ color = "#8ef" # Default color
205
+ if entity_type == "Place":
206
+ color = "#8AC7DB"
207
+ elif entity_type == "Organization":
208
+ color = "#ADD8E6"
209
+ elif entity_type == "Person":
210
+ color = "#67B7D1"
211
+ elif entity_type == "Product":
212
+ color = "#2ea3f2"
213
+ elif entity_type == "CreativeWork":
214
+ color = "#00BFFF"
215
+ elif entity_type == "Event":
216
+ color = "#1E90FF"
217
+
218
+ entity_annotation = (entity_string, entity_info["id"], color)
219
+ text_input = text_input.replace(entity_string, f'{{{str(entity_annotation)}}}', 1)
220
 
221
+ # Add the entity to JSON-LD data
222
+ entity_json_ld = combined_entity_info_dictionary[entity_string][1]
223
+ if entity_json_ld and entity_json_ld.get("link") != "None":
224
+ json_ld_data["mentions"].append(entity_json_ld)
 
225
 
226
+ # Split the modified text_input into a list
227
+ text_list = text_input.split("{")
228
+
229
+ for item in text_list:
230
+ if "}" in item:
231
+ item_list = item.split("}")
232
+ try:
233
+ final_text.append(eval(item_list[0]))
234
+ except:
235
+ final_text.append(item_list[0])
236
+ if len(item_list) > 1 and len(item_list[1]) > 0:
237
+ final_text.append(item_list[1])
238
+ else:
239
+ final_text.append(item)
240
 
241
+ # Pass the final_text to the annotated_text function
242
+ annotated_text(*final_text)
243
+
244
+ with st.expander("See annotations"):
245
+ st.write(combined_entity_info_dictionary)
 
 
 
 
 
 
 
 
 
 
 
246
 
247
+ with st.expander("Here is the final JSON-LD"):
248
+ st.json(json_ld_data) # Output JSON-LD
249
+ elif submit_button and not entities_map:
250
+ st.warning("No entities found in the text. Please try with different text or check if the model is working correctly.")