import re

import nltk
from flask import Flask, render_template, request
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

app = Flask(__name__)

# Download the NLTK resources the tokenizers and stopword list depend on.
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words("english"))


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/detect', methods=['POST'])
def detect_ai():
    content = request.form['content']

    # Tokenize the content into sentences.
    sentences = sent_tokenize(content)

    # Heuristic 1: flag submissions with fewer than three sentences.
    if len(sentences) < 3:
        return "This content is likely generated by AI"

    # Tokenize each sentence into words.
    words = [word_tokenize(sentence) for sentence in sentences]

    # Keep only alphabetic tokens, lowercased, with stop words removed.
    words = [[word.lower() for word in sentence
              if word.isalpha() and word.lower() not in stop_words]
             for sentence in words]

    # Build a frequency distribution over the remaining words.
    fdist = FreqDist(word for sentence in words for word in sentence)

    # Heuristic 2: flag content whose average word frequency is low.
    # Guard against division by zero when no words survive the filtering.
    if len(fdist) == 0:
        return "This content is likely generated by AI"
    avg_freq = sum(fdist.values()) / len(fdist)
    if avg_freq < 2:
        return "This content is likely generated by AI"

    # Heuristic 3: flag content matching any of these regex patterns
    # (a word of 5+ characters, a digit sequence, or two adjacent
    # 5+ character words separated by a space).
    regex_patterns = [r'\b\w{5,}\b', r'\b\d{1,}\b', r'\b\w{5,}\b \b\w{5,}\b']
    for pattern in regex_patterns:
        if re.search(pattern, content):
            return "This content is likely generated by AI"

    return "This content is likely not generated by AI"


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
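
# --- Usage sketch (an assumption for illustration, not part of the app) ---
# With the server running, the /detect endpoint can be exercised via curl:
#
#   curl -X POST -F "content=Paste the text to check here." \
#        http://localhost:7860/detect
#
# Or, without starting the server, via Flask's built-in test client
# (assuming this file is saved as app.py):
#
#   from app import app
#   client = app.test_client()
#   resp = client.post('/detect', data={'content': 'Paste the text here.'})
#   print(resp.data.decode())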