import re

import pandas as pd
import requests
import streamlit as st
import torch  # PyTorch backend required by the Trainer
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches page titles from URLs.
It also accepts CSV dataset uploads for fine-tuning models such as **DistilBERT**.
""")

# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])
# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")

    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                response = requests.get(
                    f"https://api.github.com/repos/{repo_owner}/{repo_name}",
                    timeout=10,
                )
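                # Note: unauthenticated GitHub API calls are rate-limited
                # (roughly 60 requests/hour per IP); supplying a personal access
                # token via the Authorization header raises the limit substantially.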
                data = response.json()

                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")
# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")

    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Try to extract the title from the HTML
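                    # Note: regex HTML parsing is brittle (attributes, casing,
                    # nesting). If bs4 is available, a parser is sturdier, e.g.:
                    #   from bs4 import BeautifulSoup
                    #   title = BeautifulSoup(response.text, "html.parser").title.string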
                    match = re.search(r"<title[^>]*>(.*?)</title>", response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1).strip()
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
    st.header("Dataset Upload & Fine-Tuning")
    uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")

    if uploaded_file is not None:
        # Load the CSV into a pandas DataFrame
        df = pd.read_csv(uploaded_file)

        # Display a dataset preview
        st.subheader("Dataset Preview")
        st.write(df.head())

        # Convert the DataFrame to a Hugging Face Dataset
        dataset = Dataset.from_pandas(df)
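        # Assumption: the CSV provides a "text" column (tokenized below) and an
        # integer "label" column, which Trainer needs to compute the loss.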
        model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])

        if st.button("Fine-tune Model"):
            if model_name:
                try:
                    model = AutoModelForSequenceClassification.from_pretrained(model_name)
                    tokenizer = AutoTokenizer.from_pretrained(model_name)

                    # Tokenize the "text" column so the model receives input_ids/attention_mask
                    def preprocess_function(examples):
                        return tokenizer(examples["text"], truncation=True, padding=True)

                    tokenized_datasets = dataset.map(preprocess_function, batched=True)
                    # Fine-tuning setup using the Hugging Face Trainer API
                    training_args = TrainingArguments(
                        output_dir="./results",
                        evaluation_strategy="epoch",
                        learning_rate=2e-5,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        num_train_epochs=3,
                        weight_decay=0.01,
                    )
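                    # These hyperparameters (2e-5 learning rate, batch size 16,
                    # ~3 epochs) are the conventional starting points for
                    # BERT-style fine-tuning, not values tuned for any dataset.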
                    trainer = Trainer(
                        model=model,
                        args=training_args,
                        train_dataset=tokenized_datasets,
                        eval_dataset=tokenized_datasets,
                    )
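                    # Note: evaluating on the training set only measures memorization.
                    # A held-out split would be more meaningful, e.g.:
                    #   split = tokenized_datasets.train_test_split(test_size=0.2)
                    #   train_dataset=split["train"], eval_dataset=split["test"]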
                    # Train the model
                    trainer.train()
                    st.success("Fine-tuning completed successfully!")
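                    # To persist the fine-tuned weights, trainer.save_model("./results")
                    # could be called here; Trainer otherwise only writes periodic checkpoints.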
                except Exception as e:
                    st.error(f"Error during fine-tuning: {e}")
            else:
                st.warning("Please select a model for fine-tuning.")
    else:
        st.warning("Please upload a dataset.")