import streamlit as st
import requests
import re
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
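
# --- Optional helper (not part of the original app flow) ---
# The URL Title Fetcher below extracts <title> with a regex, which works for
# simple pages but is brittle against malformed HTML. This is a minimal
# stdlib-only sketch of an alternative using html.parser; the names
# _TitleParser and extract_title are additions for illustration.
from html.parser import HTMLParser

class _TitleParser(HTMLParser):
    """Collects the text content of the first <title> element."""

    def __init__(self):
        super().__init__()
        self._in_title = False
        self.title = None

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title and self.title is None:
            self.title = data.strip()

def extract_title(html_text):
    parser = _TitleParser()
    parser.feed(html_text)
    return parser.title  # None if no <title> was found
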
# Title and description
st.title("OSINT Tool 🏢")
st.markdown("""
This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
""")
# Sidebar for navigation
st.sidebar.title("Navigation")
app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])
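
# --- Optional helper (not part of the original app flow) ---
# Unauthenticated GitHub API calls are rate-limited to 60 requests/hour per IP.
# A minimal sketch of an authenticated request, assuming a personal access
# token in an environment variable named GITHUB_TOKEN (hypothetical name);
# the analysis below keeps the plain unauthenticated call.
import os

def github_get(path):
    headers = {"Accept": "application/vnd.github+json"}
    token = os.environ.get("GITHUB_TOKEN")  # hypothetical variable name
    if token:
        headers["Authorization"] = f"Bearer {token}"
    return requests.get(f"https://api.github.com{path}", headers=headers, timeout=10)
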
# GitHub Repository Analysis
if app_mode == "GitHub Repository Analysis":
    st.header("GitHub Repository Analysis")
    repo_owner = st.text_input("Enter GitHub Repository Owner", "huggingface")
    repo_name = st.text_input("Enter GitHub Repository Name", "transformers")

    if st.button("Analyze Repository"):
        if repo_owner and repo_name:
            try:
                response = requests.get(
                    f"https://api.github.com/repos/{repo_owner}/{repo_name}",
                    timeout=10,
                )
                data = response.json()
                if response.status_code == 200:
                    st.subheader("Repository Details")
                    st.write(f"**Name**: {data['name']}")
                    st.write(f"**Owner**: {data['owner']['login']}")
                    st.write(f"**Stars**: {data['stargazers_count']}")
                    st.write(f"**Forks**: {data['forks_count']}")
                    st.write(f"**Language**: {data['language']}")
                    st.write(f"**Description**: {data['description']}")
                else:
                    # The GitHub API returns a JSON error body with a "message" field
                    st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter both repository owner and name.")

# URL Title Fetcher
elif app_mode == "URL Title Fetcher":
    st.header("URL Title Fetcher")
    url = st.text_input("Enter URL", "https://www.huggingface.co")

    if st.button("Fetch Title"):
        if url:
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Extract the title from the HTML; IGNORECASE handles <TITLE>,
                    # DOTALL handles titles that span multiple lines
                    match = re.search(r"<title[^>]*>(.*?)</title>", response.text, re.IGNORECASE | re.DOTALL)
                    if match:
                        title = match.group(1).strip()
                        st.write(f"**Page Title**: {title}")
                    else:
                        st.warning("Title tag not found in the page")
                else:
                    st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
            except Exception as e:
                st.error(f"Error occurred: {e}")
        else:
            st.warning("Please enter a valid URL.")
# Dataset Upload & Fine-Tuning
elif app_mode == "Dataset Upload & Fine-Tuning":
st.header("Dataset Upload & Fine-Tuning")
uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
if uploaded_file is not None:
# Load the CSV into a pandas DataFrame
df = pd.read_csv(uploaded_file)
# Display dataset preview
st.subheader("Dataset Preview")
st.write(df.head())
# Convert CSV to Hugging Face dataset format
dataset = Dataset.from_pandas(df)
model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])
if st.button("Fine-tune Model"):
if model_name:
try:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Prepare the dataset
def preprocess_function(examples):
return tokenizer(examples['text'], truncation=True, padding=True)
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Fine-tuning setup (using Hugging Face Trainer for a complete setup)
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir="./results",
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets,
eval_dataset=tokenized_datasets,
)
# Train the model
trainer.train()
st.success("Fine-tuning completed successfully!")
except Exception as e:
st.error(f"Error during fine-tuning: {e}")
else:
st.warning("Please select a model for fine-tuning.")
else:
st.warning("Please upload a dataset.")