# OSINT_Tool / app.py
import re

import pandas as pd
import requests
import streamlit as st
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
""")

# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio(
    "Choose the mode",
    ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"],
)

# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")

    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                # GitHub returns a JSON body for both success and error responses
                response = requests.get(
                    f"https://api.github.com/repos/{repo_owner}/{repo_name}",
                    timeout=10,
                )
                data = response.json()
                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")
# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")

    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Extract the title from the HTML (case-insensitive,
                    # tolerating newlines inside the tag)
                    match = re.search(r"<title>(.*?)</title>", response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1).strip()
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
    st.header("Dataset Upload & Fine-Tuning")
    uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")

    if uploaded_file is not None:
        # Load the CSV into a pandas DataFrame
        df = pd.read_csv(uploaded_file)

        # Display a dataset preview
        st.subheader("Dataset Preview")
        st.write(df.head())

        # Convert the DataFrame to a Hugging Face dataset
        dataset = Dataset.from_pandas(df)

        model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])

        if st.button("Fine-tune Model"):
            if model_name:
                try:
                    model = AutoModelForSequenceClassification.from_pretrained(model_name)
                    tokenizer = AutoTokenizer.from_pretrained(model_name)

                    # Tokenize the dataset; the CSV is expected to contain a
                    # 'text' column, plus a 'label' column for the Trainer to
                    # compute a loss
                    def preprocess_function(examples):
                        return tokenizer(examples["text"], truncation=True, padding=True)

                    tokenized_datasets = dataset.map(preprocess_function, batched=True)

                    # Fine-tuning setup using the Hugging Face Trainer
                    training_args = TrainingArguments(
                        output_dir="./results",
                        evaluation_strategy="epoch",
                        learning_rate=2e-5,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=16,
                        num_train_epochs=3,
                        weight_decay=0.01,
                    )
                    trainer = Trainer(
                        model=model,
                        args=training_args,
                        train_dataset=tokenized_datasets,
                        eval_dataset=tokenized_datasets,  # evaluates on the training data
                    )

                    # Train the model
                    trainer.train()
                    st.success("Fine-tuning completed successfully!")
                except Exception as e:
                    st.error(f"Error during fine-tuning: {e}")
            else:
                st.warning("Please select a model for fine-tuning.")
    else:
        st.warning("Please upload a dataset.")