import streamlit as st
import requests
import re
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
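
# --- Optional helper (not part of the original app flow) ---
# The URL Title Fetcher below extracts <title> with a regex, which works for
# simple pages but is brittle against malformed HTML. This is a minimal
# stdlib-only sketch of an alternative using html.parser; the names
# _TitleParser and extract_title are additions for illustration.
from html.parser import HTMLParser

class _TitleParser(HTMLParser):
    """Collects the text content of the first <title> element."""

    def __init__(self):
        super().__init__()
        self._in_title = False
        self.title = None

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title and self.title is None:
            self.title = data.strip()

def extract_title(html_text):
    parser = _TitleParser()
    parser.feed(html_text)
    return parser.title  # None if no <title> was found
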
# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
""")
# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])
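
# --- Optional helper (not part of the original app flow) ---
# Unauthenticated GitHub API calls are rate-limited to 60 requests/hour per IP.
# A minimal sketch of an authenticated request, assuming a personal access
# token in an environment variable named GITHUB_TOKEN (hypothetical name);
# the analysis below keeps the plain unauthenticated call.
import os

def github_get(path):
    headers = {"Accept": "application/vnd.github+json"}
    token = os.environ.get("GITHUB_TOKEN")  # hypothetical variable name
    if token:
        headers["Authorization"] = f"Bearer {token}"
    return requests.get(f"https://api.github.com{path}", headers=headers, timeout=10)
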
# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")

    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                response = requests.get(
                    f"https://api.github.com/repos/{repo_owner}/{repo_name}",
                    timeout=10,
                )
                data = response.json()
                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    # The GitHub API returns a JSON error body with a "message" field
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")

# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")

    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Extract the title from the HTML; IGNORECASE handles <TITLE>,
                    # DOTALL handles titles that span multiple lines
                    match = re.search(r"<title[^>]*>(.*?)</title>", response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1).strip()
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
st.header("Dataset Upload & Fine-Tuning")
uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
if uploaded_file is not None:
# Load the CSV into a pandas DataFrame
df = pd.read_csv(uploaded_file)
# Display dataset preview
st.subheader("Dataset Preview")
st.write(df.head())
# Convert CSV to Hugging Face dataset format
dataset = Dataset.from_pandas(df)
model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])
if st.button("Fine-tune Model"):
if model_name:
try:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Prepare the dataset
def preprocess_function(examples):
return tokenizer(examples['text'], truncation=True, padding=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Fine-tuning setup (using Hugging Face Trainer for a complete setup)
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets,
eval_dataset=tokenized_datasets,
)
# Train the model
trainer.train()
st.success("Fine-tuning completed successfully!")
except Exception as e:
st.error(f"Error during fine-tuning: {e}")
else:
st.warning("Please select a model for fine-tuning.")
else:
st.warning("Please upload a dataset.")