Spaces:

WordLift
/

entity-linking

Running

App Files Files

cyberandy committed 2 days ago

Commit

7d64ce5

verified ·

1 Parent(s): 82856f6

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -85

app.py CHANGED Viewed

@@ -20,13 +20,15 @@ st.set_page_config(
 # Sidebar
 st.sidebar.image("logo-wordlift.png")
 language_options = {"English", "English - spaCy", "German"}
 selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
 # Based on selected language, configure model, entity set, and citation options
 if selected_language == "German" or selected_language == "English - spaCy":
-    selected_model_name = None
-    selected_entity_set = None
     entity_fishing_citation = """
     @misc{entity-fishing,
     title = {entity-fishing},
@@ -36,10 +38,9 @@ if selected_language == "German" or selected_language == "English - spaCy":
     eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(entity_fishing_citation)
-else:
     model_options = ["aida_model", "wikipedia_model_with_numbers"]
     entity_set_options = ["wikidata", "wikipedia"]
@@ -54,36 +55,42 @@ else:
     year = "2022"
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(refined_citation)
-@st.cache_resource  # 👈 Add the caching decorator
 def load_model(selected_language, model_name=None, entity_set=None):
     if selected_language == "German":
         # Load the German-specific model
         nlp_model_de = spacy.load("de_core_news_lg")
-        nlp_model_de.add_pipe("entityfishing")
         return nlp_model_de
     elif selected_language == "English - spaCy":
         # Load English-specific model
         nlp_model_en = spacy.load("en_core_web_sm")
-        nlp_model_en.add_pipe("entityfishing")
-        return nlp_model_en
-    else:
         # Load the pretrained model for other languages
         refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
         return refined_model
 # Use the cached model
 model = load_model(selected_language, selected_model_name, selected_entity_set)
 # Helper functions
-def get_wikidata_id(entity_string):
-    entity_list = entity_string.split("=")
-    entity_id = str(entity_list[1])
     entity_link = "http://www.wikidata.org/entity/" + entity_id
     return {"id": entity_id, "link": entity_link}
@@ -92,14 +99,15 @@ def get_entity_data(entity_link):
         # Format the entity_link
         formatted_link = entity_link.replace("http://", "http/")
         response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
         return response.json()
-    except Exception as e:
-        print(f"Exception when fetching data for entity: {entity_link}. Exception: {e}")
         return None
 # Create the form
 with st.form(key='my_form'):
-    text_input = st.text_area(label='Enter a sentence')
     submit_button = st.form_submit_button(label='Analyze')
 # Initialization
@@ -109,92 +117,90 @@ entities_data = {}
 if text_input:
     if selected_language in ["German", "English - spaCy"]:
         doc = model(text_input)
-        entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
-        for entity in entities:
             entity_string, entity_type, wikidata_id, wikidata_url = entity
             if wikidata_url:
-                # Ensure correct format for the German and English model
                 formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                 entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                 entity_data = get_entity_data(formatted_wikidata_url)
                 if entity_data is not None:
                     entities_data[entity_string] = entity_data
-    else:
-        entities = model.process_text(text_input)
-        for entity in entities:
-            single_entity_list = str(entity).strip('][').replace("\'", "").split(', ')
-            if len(single_entity_list) >= 2 and "wikidata" in single_entity_list[1]:
-                entities_map[single_entity_list[0].strip()] = get_wikidata_id(single_entity_list[1])
-                entity_data = get_entity_data(entities_map[single_entity_list[0].strip()]["link"])
                 if entity_data is not None:
-                    entities_data[single_entity_list[0].strip()] = entity_data
-    combined_entity_info_dictionary = dict([(k, [entities_map[k], entities_data[k] if k in entities_data else None]) for k in entities_map])
     if submit_button:
-        # Prepare a list to hold the final output
         final_text = []
-        # JSON-LD data
         json_ld_data = {
                 "@context": "https://schema.org",
                 "@type": "WebPage",
                 "mentions": []
             }
-       # Replace each entity in the text with its annotated version
-        for entity_string, entity_info in entities_map.items():
-            # Check if the entity has a valid Wikidata link
-            if entity_info["link"] is None or entity_info["link"] == "None":
-                continue  # skip this entity
-            entity_data = entities_data.get(entity_string, None)
-            entity_type = None
-            if entity_data is not None:
-                entity_type = entity_data.get("@type", None)
-            # Use different colors based on the entity's type
-            color = "#8ef"  # Default color
-            if entity_type == "Place":
-                color = "#8AC7DB"
-            elif entity_type == "Organization":
-                color = "#ADD8E6"
-            elif entity_type == "Person":
-                color = "#67B7D1"
-            elif entity_type == "Product":
-                color = "#2ea3f2"
-            elif entity_type == "CreativeWork":
-                color = "#00BFFF"
-            elif entity_type == "Event":
-                color = "#1E90FF"
-            entity_annotation = (entity_string, entity_info["id"], color)
-            text_input = text_input.replace(entity_string, f'{{{str(entity_annotation)}}}', 1)
-            # Add the entity to JSON-LD data
-            entity_json_ld = combined_entity_info_dictionary[entity_string][1]
-            if entity_json_ld and entity_json_ld.get("link") != "None":
-                json_ld_data["mentions"].append(entity_json_ld)
-        # Split the modified text_input into a list
-        text_list = text_input.split("{")
-        for item in text_list:
-            if "}" in item:
-                item_list = item.split("}")
-                final_text.append(eval(item_list[0]))
-                if len(item_list[1]) > 0:
-                    final_text.append(item_list[1])
-            else:
-                final_text.append(item)
-        # Pass the final_text to the annotated_text function
-        annotated_text(*final_text)
         with st.expander("See annotations"):
             st.write(combined_entity_info_dictionary)
         with st.expander("Here is the final JSON-LD"):
-            st.json(json_ld_data)  # Output JSON-LD

 # Sidebar
 st.sidebar.image("logo-wordlift.png")
 language_options = {"English", "English - spaCy", "German"}
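+# Intended to default to English to avoid an error on the first run. NOTE: language_options is a set, so list(language_options)[0] is not guaranteed to be "English"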
 selected_language = st.sidebar.selectbox("Select the Language", list(language_options), index=0)
+# Initialize model and entity set variables
+selected_model_name = None
+selected_entity_set = None
 # Based on selected language, configure model, entity set, and citation options
 if selected_language == "German" or selected_language == "English - spaCy":
     entity_fishing_citation = """
     @misc{entity-fishing,
     title = {entity-fishing},
     eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(entity_fishing_citation)
+else: # English (Refined)
     model_options = ["aida_model", "wikipedia_model_with_numbers"]
     entity_set_options = ["wikidata", "wikipedia"]
     year = "2022"
     }
     """
     with st.sidebar.expander('Citations'):
         st.markdown(refined_citation)
+@st.cache_resource
 def load_model(selected_language, model_name=None, entity_set=None):
+    # Define the public URL for the entity-fishing service
+    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
     if selected_language == "German":
         # Load the German-specific model
         nlp_model_de = spacy.load("de_core_news_lg")
+        # Add the entity-fishing pipe with the server URL configured
+        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
         return nlp_model_de
     elif selected_language == "English - spaCy":
         # Load English-specific model
         nlp_model_en = spacy.load("en_core_web_sm")
+        # Add the entity-fishing pipe with the server URL configured
+        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
+        return nlp_model_en
+    else: # English (Refined)
         # Load the pretrained model for other languages
         refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
         return refined_model
 # Use the cached model
+# We pass the selected options directly to the cached function
+# Streamlit's caching handles re-running this only when the inputs change
 model = load_model(selected_language, selected_model_name, selected_entity_set)
 # Helper functions
+def get_wikidata_id(entity_id_string):
+    # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
+    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
     entity_link = "http://www.wikidata.org/entity/" + entity_id
     return {"id": entity_id, "link": entity_link}
         # Format the entity_link
         formatted_link = entity_link.replace("http://", "http/")
         response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
+        response.raise_for_status() # Raise an exception for bad status codes
         return response.json()
+    except requests.exceptions.RequestException as e:
+        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
         return None
 # Create the form
 with st.form(key='my_form'):
+    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
     submit_button = st.form_submit_button(label='Analyze')
 # Initialization
 if text_input:
     if selected_language in ["German", "English - spaCy"]:
         doc = model(text_input)
+        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
+        for entity in spacy_entities:
             entity_string, entity_type, wikidata_id, wikidata_url = entity
             if wikidata_url:
                 formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                 entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                 entity_data = get_entity_data(formatted_wikidata_url)
                 if entity_data is not None:
                     entities_data[entity_string] = entity_data
+    else: # Refined model
+        refined_entities = model.process_text(text_input)
+        for entity in refined_entities:
+            # More robustly access entity attributes instead of parsing a string
+            if entity.entity_id and "wikidata" in entity.entity_id:
+                entity_text = entity.text
+                wikidata_info = get_wikidata_id(entity.entity_id)
+                entities_map[entity_text] = wikidata_info
+                entity_data = get_entity_data(wikidata_info["link"])
                 if entity_data is not None:
+                    entities_data[entity_text] = entity_data
+    combined_entity_info_dictionary = {
+        k: [entities_map[k], entities_data.get(k)] for k in entities_map
+    }
     if submit_button:
+        # A more robust way to build the annotated_text list without using eval()
         final_text = []
+        current_pos = 0
+        # Create a simple list of (text, start, end) for sorting
+        entity_spans = []
+        if selected_language in ["German", "English - spaCy"]:
+            # 'doc' is available from the processing block above
+            for ent in doc.ents:
+                if ent.text in entities_map: # only include linked entities
+                    entity_spans.append((ent.text, ent.start_char, ent.end_char))
+        else:
+            # 'refined_entities' is available
+             for ent in refined_entities:
+                if ent.text in entities_map:
+                    entity_spans.append((ent.text, ent.span[0], ent.span[1]))
+        # Sort entities by their starting position to handle the text correctly
+        sorted_entities = sorted(entity_spans, key=lambda x: x[1])
+        for entity_string, start, end in sorted_entities:
+            # Add the text segment before the current entity
+            final_text.append(text_input[current_pos:start])
+            # Prepare the annotation for the entity
+            entity_info = entities_map.get(entity_string, {})
+            entity_id = entity_info.get("id", "N/A")
+            entity_type_data = entities_data.get(entity_string)
+            entity_type = entity_type_data.get("@type") if entity_type_data else None
+            color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
+                     "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
+            final_text.append((entity_string, entity_id, color))
+            current_pos = end
+        # Add any remaining text after the last entity
+        final_text.append(text_input[current_pos:])
+        st.header("Annotated Text")
+        annotated_text(*[item for item in final_text if item]) # Filter out empty strings
+        # --- JSON-LD Generation ---
         json_ld_data = {
                 "@context": "https://schema.org",
                 "@type": "WebPage",
                 "mentions": []
             }
+        for entity_string, info_list in combined_entity_info_dictionary.items():
+            entity_json_ld = info_list[1] # The data from WordLift API
+            if entity_json_ld:
+                 json_ld_data["mentions"].append(entity_json_ld)
         with st.expander("See annotations"):
             st.write(combined_entity_info_dictionary)
         with st.expander("Here is the final JSON-LD"):
+            st.json(json_ld_data)