# Spaces: Running  (Hugging Face Spaces page-status text captured during
# extraction — not part of the app source; kept here as a comment only)
# Refactored Streamlit App for Setswana NER using HuggingFace Models
import streamlit as st
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
import spacy

# -------------------- PAGE CONFIG --------------------
st.set_page_config(layout="wide")

# -------------------- UI HEADER --------------------
# NOTE(review): `use_column_width` is deprecated in recent Streamlit releases
# in favour of `use_container_width=True` — confirm the pinned Streamlit
# version before switching.
st.image("logo_transparent_small.png", use_column_width="always")
st.title("Demo for Setswana PuoBERTa NER Model")

# -------------------- MODEL SELECTION --------------------
model_list = ['dsfsi/PuoBERTa-NER']
model_checkpoint = st.sidebar.radio("Select NER Model", model_list)
aggregation_strategy = "simple"  # HF pipeline grouping strategy for sub-tokens

# -------------------- TEXT INPUT --------------------
input_method = st.radio("Select Input Method", ['Example Text', 'Write Text', 'Upload CSV'])
def get_input_text():
    """Return the text to run NER on, based on the selected input method.

    - 'Example Text': a canned Setswana sentence picked from a selectbox.
    - 'Write Text': free-form text from a text area.
    - 'Upload CSV': every non-empty value of a user-chosen column,
      joined with newlines.

    Returns:
        str: the input text, or "" when nothing is available yet (e.g. no
        file uploaded), so callers can safely call ``.strip()`` on it.
    """
    if input_method == 'Example Text':
        examples = [
            "Moso ono mo dikgang tsa ura le ura, o tsoga le Oarabile Moamogwe go simolola ka 05:00 - 10:00"
        ]
        return st.selectbox("Example Sentences", examples)
    elif input_method == 'Write Text':
        return st.text_area("Enter text", height=128)
    elif input_method == 'Upload CSV':
        uploaded = st.file_uploader("Upload CSV", type="csv")
        if uploaded:
            df = pd.read_csv(uploaded)
            col = st.selectbox("Choose column with text", df.columns)
            return "\n".join(df[col].dropna().astype(str).tolist())
    # Fall-through: CSV selected but no file uploaded yet.
    return ""

input_text = get_input_text()
# -------------------- MODEL LOADING --------------------
@st.cache_resource  # cache the pipeline so the model is not re-downloaded/reloaded on every rerun
def load_ner_pipeline(model_checkpoint, strategy):
    """Build (and cache) a HF token-classification pipeline.

    Args:
        model_checkpoint: HF hub model id (e.g. 'dsfsi/PuoBERTa-NER').
        strategy: aggregation_strategy passed to the pipeline
            ('simple' groups sub-word tokens into word-level entities).

    Returns:
        A transformers token-classification pipeline ready to call on text.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    return pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy=strategy)
# -------------------- ENTITY MERGE --------------------
def merge_entities(output):
    """Merge adjacent entity spans of the same type into single entities.

    The pipeline can emit consecutive fragments that touch (previous
    ``end`` == next ``start``) and share an ``entity_group``; these are
    concatenated into one span.

    Args:
        output: list of entity dicts (as returned by the HF pipeline) with
            at least "word", "entity_group", "start" and "end" keys,
            ordered by position in the text.

    Returns:
        A new list of entity dicts. Merged entries are shallow copies, so
        the caller's ``output`` dicts are never mutated (the original code
        mutated them in place).
    """
    merged = []
    for i, ent in enumerate(output):
        touches_prev = (
            i > 0
            and ent["start"] == output[i - 1]["end"]
            and ent["entity_group"] == output[i - 1]["entity_group"]
        )
        if touches_prev:
            merged[-1]["word"] += ent["word"]
            merged[-1]["end"] = ent["end"]
        else:
            # Copy so extending "word"/"end" above never clobbers the input.
            merged.append(dict(ent))
    return merged
# -------------------- RUN NER --------------------
if st.button("Run NER") and input_text.strip():
    with st.spinner("Running NER..."):
        ner = load_ner_pipeline(model_checkpoint, aggregation_strategy)
        output = ner(input_text)
        entities = merge_entities(output)
    if entities:
        df = pd.DataFrame(entities)[['word', 'entity_group', 'score', 'start', 'end']]
        st.subheader("Recognized Entities")
        st.dataframe(df)

        # Render highlighted text with displaCy in manual mode: we supply
        # the character spans ourselves instead of a spaCy Doc.
        spacy_display = {"text": input_text, "ents": [], "title": None}
        for ent in entities:
            label = ent["entity_group"]
            if label == "PER":
                # displaCy's built-in colour scheme keys on "PERSON", not "PER".
                label = "PERSON"
            spacy_display["ents"].append({"start": ent["start"], "end": ent["end"], "label": label})
        html = spacy.displacy.render(spacy_display, style="ent", manual=True, minify=True)
        # Force inline-block marks and horizontal scrolling for long lines.
        styled_html = f"<style>mark.entity {{ display: inline-block; }}</style><div style='overflow-x:auto;'>{html}</div>"
        st.markdown(styled_html, unsafe_allow_html=True)
    else:
        st.info("No entities recognized in the input.")
# -------------------- AUTHORS, CITATION & FEEDBACK --------------------
# Bug fix: the original markdown opened a ```bibtex fence but never closed
# it, so everything after the citation rendered inside the code block.
st.markdown("""
---
### 📚 Authors & Citation

**Authors**
Vukosi Marivate, Moseli Mots'Oehli, Valencia Wagner, Richard Lastrucci, Isheanesu Dzingirai

**Citation**
```bibtex
@inproceedings{marivate2023puoberta,
  title = {PuoBERTa: Training and evaluation of a curated language model for Setswana},
  author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai},
  year = {2023},
  booktitle= {Artificial Intelligence Research. SACAIR 2023. Communications in Computer and Information Science},
  url= {https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17},
  keywords = {NLP},
  preprint_url = {https://arxiv.org/abs/2310.09141},
  dataset_url = {https://github.com/dsfsi/PuoBERTa},
  software_url = {https://huggingface.co/dsfsi/PuoBERTa}
}
```
""")