# Spaces: Running  (Hugging Face Spaces page-status text captured during
# extraction — not part of the app source; kept here as a comment only)
# Refactored Streamlit App for Setswana NER using HuggingFace Models
import streamlit as st
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
import spacy

# -------------------- PAGE CONFIG --------------------
st.set_page_config(layout="wide")

# -------------------- UI HEADER --------------------
# NOTE(review): `use_column_width` is deprecated in recent Streamlit releases
# in favour of `use_container_width=True` — confirm the pinned Streamlit
# version before switching.
st.image("logo_transparent_small.png", use_column_width="always")
st.title("Demo for Setswana PuoBERTa NER Model")

# -------------------- MODEL SELECTION --------------------
model_list = ['dsfsi/PuoBERTa-NER']
model_checkpoint = st.sidebar.radio("Select NER Model", model_list)
aggregation_strategy = "simple"  # HF pipeline grouping strategy for sub-tokens

# -------------------- TEXT INPUT --------------------
input_method = st.radio("Select Input Method", ['Example Text', 'Write Text', 'Upload CSV'])
def get_input_text():
    """Return the text to run NER on, based on the selected input method.

    - 'Example Text': a canned Setswana sentence picked from a selectbox.
    - 'Write Text': free-form text from a text area.
    - 'Upload CSV': every non-empty value of a user-chosen column,
      joined with newlines.

    Returns:
        str: the input text, or "" when nothing is available yet (e.g. no
        file uploaded), so callers can safely call ``.strip()`` on it.
    """
    if input_method == 'Example Text':
        examples = [
            "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
        ]
        return st.selectbox("Example Sentences", examples)
    elif input_method == 'Write Text':
        return st.text_area("Enter text", height=128)
    elif input_method == 'Upload CSV':
        uploaded = st.file_uploader("Upload CSV", type="csv")
        if uploaded:
            df = pd.read_csv(uploaded)
            col = st.selectbox("Choose column with text", df.columns)
            return "\n".join(df[col].dropna().astype(str).tolist())
    # Fall-through: CSV selected but no file uploaded yet.
    return ""

input_text = get_input_text()
# -------------------- MODEL LOADING --------------------
@st.cache_resource  # cache the pipeline so the model is not re-downloaded/reloaded on every rerun
def load_ner_pipeline(model_checkpoint, strategy):
    """Build (and cache) a HF token-classification pipeline.

    Args:
        model_checkpoint: HF hub model id (e.g. 'dsfsi/PuoBERTa-NER').
        strategy: aggregation_strategy passed to the pipeline
            ('simple' groups sub-word tokens into word-level entities).

    Returns:
        A transformers token-classification pipeline ready to call on text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=strategy)
# -------------------- ENTITY MERGE --------------------
def merge_entities(output):
    """Merge adjacent entity spans of the same type into single entities.

    The pipeline can emit consecutive fragments that touch (previous
    ``end`` == next ``start``) and share an ``entity_group``; these are
    concatenated into one span.

    Args:
        output: list of entity dicts (as returned by the HF pipeline) with
            at least "word", "entity_group", "start" and "end" keys,
            ordered by position in the text.

    Returns:
        A new list of entity dicts. Merged entries are shallow copies, so
        the caller's ``output`` dicts are never mutated (the original code
        mutated them in place).
    """
    merged = []
    for i, ent in enumerate(output):
        touches_prev = (
            i > 0
            and ent["start"] == output[i - 1]["end"]
            and ent["entity_group"] == output[i - 1]["entity_group"]
        )
        if touches_prev:
            merged[-1]["word"] += ent["word"]
            merged[-1]["end"] = ent["end"]
        else:
            # Copy so extending "word"/"end" above never clobbers the input.
            merged.append(dict(ent))
    return merged
# -------------------- RUN NER --------------------
if st.button("Run NER") and input_text.strip():
    with st.spinner("Running NER..."):
        ner = load_ner_pipeline(model_checkpoint, aggregation_strategy)
        output = ner(input_text)
        entities = merge_entities(output)
    if entities:
        df = pd.DataFrame(entities)[['word', 'entity_group', 'score', 'start', 'end']]
        st.subheader("Recognized Entities")
        st.dataframe(df)

        # Render highlighted text with displaCy in manual mode: we supply
        # the character spans ourselves instead of a spaCy Doc.
        spacy_display = {"text": input_text, "ents": [], "title": None}
        for ent in entities:
            label = ent["entity_group"]
            if label == "PER":
                # displaCy's built-in colour scheme keys on "PERSON", not "PER".
                label = "PERSON"
            spacy_display["ents"].append({"start": ent["start"], "end": ent["end"], "label": label})
        html = spacy.displacy.render(spacy_display, style="ent", manual=True, minify=True)
        # Force inline-block marks and horizontal scrolling for long lines.
        styled_html = f"<style>mark.entity {{ display: inline-block; }}</style><div style='overflow-x:auto;'>{html}</div>"
        st.markdown(styled_html, unsafe_allow_html=True)
    else:
        st.info("No entities recognized in the input.")
# -------------------- AUTHORS, CITATION & FEEDBACK --------------------
# Bug fix: the original markdown opened a ```bibtex fence but never closed
# it, so everything after the citation rendered inside the code block.
st.markdown("""
---
### 📚 Authors & Citation

**Authors**
Vukosi Marivate, Moseli Mots'Oehli, Valencia Wagner, Richard Lastrucci, Isheanesu Dzingirai

**Citation**
```bibtex
@inproceedings{marivate2023puoberta,
  title = {PuoBERTa: Training and evaluation of a curated language model for Setswana},
  author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai},
  year = {2023},
  booktitle= {Artificial Intelligence Research. SACAIR 2023. Communications in Computer and Information Science},
  url= {https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17},
  keywords = {NLP},
  preprint_url = {https://arxiv.org/abs/2310.09141},
  dataset_url = {https://github.com/dsfsi/PuoBERTa},
  software_url = {https://huggingface.co/dsfsi/PuoBERTa}
}
```
""")