PuoBERTaSpace / app.py
vukosi's picture
Update app.py
4948d8f verified
raw
history blame
4.36 kB
# Refactored Streamlit App for Setswana NER using HuggingFace Models
import streamlit as st
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
import spacy
# -------------------- PAGE CONFIG --------------------
# Must be the first Streamlit call on the page; "wide" uses the full browser width.
st.set_page_config(layout="wide")
# -------------------- UI HEADER --------------------
# NOTE(review): `use_column_width` is deprecated in recent Streamlit releases in
# favour of `use_container_width` — confirm the pinned Streamlit version before changing.
st.image("logo_transparent_small.png", use_column_width="always")
st.title("Demo for Setswana PuoBERTa NER Model")
# -------------------- MODEL SELECTION --------------------
# Single checkpoint today; the sidebar radio keeps the UI ready for more models.
model_list = ['dsfsi/PuoBERTa-NER']
model_checkpoint = st.sidebar.radio("Select NER Model", model_list)
# Passed through to the HF pipeline's aggregation_strategy (groups word pieces into entities).
aggregation_strategy = "simple"
# -------------------- TEXT INPUT --------------------
# Chooses which branch of get_input_text() below supplies the text to analyse.
input_method = st.radio("Select Input Method", ['Example Text', 'Write Text', 'Upload CSV'])
def get_input_text():
    """Return the text to run NER on, per the input method selected above.

    Depending on `input_method` this yields the chosen example sentence, the
    free-form text typed by the user, or the newline-joined values of one CSV
    column. Yields an empty string while no CSV has been uploaded yet.
    """
    if input_method == 'Example Text':
        sample_sentences = [
            "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
        ]
        return st.selectbox("Example Sentences", sample_sentences)
    if input_method == 'Write Text':
        return st.text_area("Enter text", height=128)
    if input_method == 'Upload CSV':
        csv_file = st.file_uploader("Upload CSV", type="csv")
        if csv_file:
            frame = pd.read_csv(csv_file)
            text_column = st.selectbox("Choose column with text", frame.columns)
            # One document: all non-empty cells of the chosen column, one per line.
            return "\n".join(frame[text_column].dropna().astype(str).tolist())
    # Nothing uploaded yet -> nothing to analyse.
    return ""
input_text = get_input_text()
# -------------------- MODEL LOADING --------------------
@st.cache_resource
def load_ner_pipeline(model_checkpoint, strategy):
    """Build a HuggingFace token-classification pipeline for `model_checkpoint`.

    Decorated with st.cache_resource so the tokenizer/model download and load
    happen once per server process, not on every Streamlit rerun. `strategy`
    is forwarded as the pipeline's aggregation_strategy.
    """
    ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    return pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy=strategy,
    )
# -------------------- ENTITY MERGE --------------------
def merge_entities(output):
    """Merge adjacent same-group entities from the NER pipeline into single spans.

    Even with aggregation_strategy="simple" the pipeline can emit back-to-back
    fragments of one entity; two consecutive entries are fused when the second
    starts exactly at the character offset where the first ended AND both carry
    the same entity_group.

    Fix over the previous version: appended entries are shallow-copied, so the
    caller's `output` dicts are never mutated when a later fragment is folded
    into them (`merged[-1]["word"] += ...` used to alias the input dict).

    Parameters:
        output: list of pipeline entity dicts with at least the keys
                "word", "entity_group", "start", "end".
    Returns:
        A new list of (copied) entity dicts with adjacent fragments merged.
        An empty input yields an empty list.
    """
    merged = []
    previous = None  # last raw (un-merged) entity seen, for adjacency checks
    for ent in output:
        if (
            previous is not None
            and ent["start"] == previous["end"]
            and ent["entity_group"] == previous["entity_group"]
        ):
            # Extend the current merged span rather than starting a new one.
            merged[-1]["word"] += ent["word"]
            merged[-1]["end"] = ent["end"]
        else:
            merged.append(dict(ent))  # copy: never mutate the caller's dicts
        previous = ent
    return merged
# -------------------- RUN NER --------------------
# Run only when the button is clicked AND there is non-whitespace input.
if st.button("Run NER") and input_text.strip():
    with st.spinner("Running NER..."):
        # Cached pipeline: model loads only on the first run per session.
        ner = load_ner_pipeline(model_checkpoint, aggregation_strategy)
        output = ner(input_text)
        # Fuse adjacent same-group fragments into whole entities.
        entities = merge_entities(output)
    if entities:
        # Tabular view of the recognized entities.
        df = pd.DataFrame(entities)[['word','entity_group','score','start','end']]
        st.subheader("Recognized Entities")
        st.dataframe(df)
        # Build the manual-format payload displaCy expects: raw text plus
        # character-offset spans with labels.
        spacy_display = {"text": input_text, "ents": [], "title": None}
        for ent in entities:
            label = ent["entity_group"]
            # displaCy's built-in colour scheme keys on "PERSON", not "PER".
            if label == "PER":
                label = "PERSON"
            spacy_display["ents"].append({"start": ent["start"], "end": ent["end"], "label": label})
        # NOTE(review): relies on `spacy.displacy` being reachable after a plain
        # `import spacy`; the documented form is `from spacy import displacy` — verify.
        html = spacy.displacy.render(spacy_display, style="ent", manual=True, minify=True)
        # Force entity marks inline and allow horizontal scrolling for long lines.
        styled_html = f"<style>mark.entity {{ display: inline-block; }}</style><div style='overflow-x:auto;'>{html}</div>"
        st.markdown(styled_html, unsafe_allow_html=True)
    else:
        st.info("No entities recognized in the input.")
# -------------------- AUTHORS, CITATION & FEEDBACK --------------------
# -------------------- AUTHORS, CITATION & FEEDBACK --------------------
# Fix: the ```bibtex fence was never closed, so the code block swallowed the
# rest of the markdown and rendered broken; the closing fence is now present.
st.markdown("""
---
### 📚 Authors & Citation

**Authors**
Vukosi Marivate, Moseli Mots'Oehli, Valencia Wagner, Richard Lastrucci, Isheanesu Dzingirai

**Citation**
```bibtex
@inproceedings{marivate2023puoberta,
  title = {PuoBERTa: Training and evaluation of a curated language model for Setswana},
  author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai},
  year = {2023},
  booktitle= {Artificial Intelligence Research. SACAIR 2023. Communications in Computer and Information Science},
  url= {https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17},
  keywords = {NLP},
  preprint_url = {https://arxiv.org/abs/2310.09141},
  dataset_url = {https://github.com/dsfsi/PuoBERTa},
  software_url = {https://huggingface.co/dsfsi/PuoBERTa}
}
```
""")