sebastiansarasti commited on
Commit
fa08326
·
0 Parent(s):

first commit

Browse files
.github/workflows/hugggingface.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ jobs:
7
+ sync-to-hub:
8
+ runs-on: ubuntu-latest
9
+ steps:
10
+ - uses: actions/checkout@v3
11
+ with:
12
+ fetch-depth: 0
13
+ lfs: true
14
+ - name: Push to hub
15
+ env:
16
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
17
+ run: git push --force https://${{ secrets.HF_USERNAME }}:$HF_TOKEN@huggingface.co/spaces/${{ secrets.HF_USERNAME }}/${{ secrets.SPACE_NAME }} main
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# The slim base image does not ship curl, but the HEALTHCHECK below needs it
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl && \
    rm -rf /var/lib/apt/lists/*

# Copy files
COPY src/app ./src/app

# Install uv and Python packages
# NOTE(review): the copied requirements file does not appear to list streamlit,
# which the ENTRYPOINT needs - confirm it gets installed.
RUN pip install uv
# The path is relative to WORKDIR: the file was copied to /app/src/app/requirements.txt
RUN uv pip install --system -r src/app/requirements.txt

# Create non-root user and give permissions
RUN useradd -m appuser && \
    mkdir -p /app/cache /app/.streamlit && \
    chown -R appuser:appuser /app

# Set environment variables for Hugging Face and Streamlit
ENV HF_HOME=/app/cache
ENV STREAMLIT_CONFIG_DIR=/app/.streamlit

# Switch to non-root user
USER appuser

# Expose Streamlit port
EXPOSE 8501

# Healthcheck
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Run Streamlit app
ENTRYPOINT ["streamlit", "run", "src/app/main.py", "--server.port=8501", "--server.address=0.0.0.0"]
33
+
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: NLP conference Crossbridge
3
+ app_port: 8501
4
+ emoji: 🈂️
5
+ colorFrom: gray
6
+ colorTo: purple
7
+ sdk: docker
8
+ pinned: false
9
+ license: mit
10
+ short_description: Traditional NLP for AI written detection
11
+ ---
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ numpy == 1.26.4
2
+ pandas == 2.2.0
3
+ pyarrow == 15.0.0
4
+ fastparquet == 2024.2.0
5
+ mlflow == 2.10.2
6
+ nltk == 3.8.1
7
+ seaborn == 0.13.2
8
+ matplotlib == 3.8.2
9
+ python-dotenv == 1.0.1
src/app/__init__.py ADDED
File without changes
src/app/__pycache__/pipelines.cpython-311.pyc ADDED
Binary file (2.64 kB). View file
 
src/app/__pycache__/xai.cpython-311.pyc ADDED
Binary file (1.66 kB). View file
 
src/app/main.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import sys
3
+ from pipelines import pipeline_inference
4
+ from xai import get_explanation
5
+ import time
6
+ import pandas as pd
7
+ import plotly.express as px
8
+
9
+ import nltk
10
+
11
nltk.download('stopwords')


def _show_probabilities(result):
    """Render the human/AI probabilities and the verdict derived from them.

    Args:
        result: Output of ``pipeline_inference``; ``result[0][0]`` is shown as
            the human probability and ``result[0][1]`` as the AI probability.
    """
    st.subheader('Probability that the text was classified as:')
    col_human, col_ai = st.columns(2)
    with col_human:
        st.metric('Human written', result[0][0])
    with col_ai:
        st.metric('AI written', result[0][1])
    if result[0][1] > 0.6:
        st.warning('High probability that the text was written by an AI')
    else:
        st.success('High probability that the text was written by a human')


def _show_explanation(text):
    """Plot the per-word contributions returned by ``get_explanation``.

    Args:
        text: The text that was classified.
    """
    with st.spinner('Explaining the classification, wait please ...'):
        explanation = get_explanation(text)
        df = pd.DataFrame(list(explanation.items()), columns=['Words', 'Frequency'])
        # Non-negative weights are labelled as AI patterns, negative ones as human patterns
        df['Type'] = ['IA Pattern' if x >= 0 else 'Humman Pattern' for x in df['Frequency']]
        df = df.sort_values('Frequency', ascending=False)
        fig = px.bar(df, y='Words', x='Frequency', color='Type', color_discrete_map={'IA Pattern': 'red', 'Humman Pattern': 'blue'})
        st.subheader('Explanation of the classification:')
        st.markdown('The following words are the most important to classify the text:')
        st.plotly_chart(fig)


st.title('Text identification app')

st.subheader('This app is designed to identify if a text was written by a human or an AI')
st.markdown('In many cases, using AI is not a suitable solution because this does not allow to develop creativity and innovation in written assessments')

col1, col2 = st.columns(2)
with col1:
    a = st.button('Classify text')
with col2:
    xai_option = st.toggle('Explain the classification', value = False)

with st.sidebar:
    st.subheader('About the App')
    st.markdown('Data used for the training come from the following source: https://www.kaggle.com/datasets/shanegerami/ai-vs-human-text')
    st.markdown('The model built is not based on transformer architecture, it uses traditional Natural Language Processing techniques')
    st.empty()
    st.subheader('Author')
    st.markdown('Sebastián Sarasti Zambonino')
    st.markdown('Data Scientist - Machine Learning Engineer')
    st.markdown('https://www.linkedin.com/in/sebastiansarasti/')
    st.markdown('https://github.com/sebassaras02')

text_input = st.text_area('Enter the text to classify', height = 200)


result = None
if a:
    if not text_input.strip():
        # st.error shows a plain message; st.exception expects an exception
        # object, not a string. The message is now shown for both modes.
        st.error('Please enter the text to classify, no text was provided')
    else:
        with st.spinner('Classifying the text, wait please ...'):
            # presumably keeps the spinner visible for fast predictions - confirm
            time.sleep(1)
            result = pipeline_inference(text_input)
        _show_probabilities(result)
        if xai_option:
            _show_explanation(text_input)
87
+
88
+
89
+
src/app/pipelines.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import re
4
+ import mlflow
5
+ from joblib import dump, load
6
+ import sys
7
+
8
+ from utils.text_processing import TextProcessing
9
+
10
+
11
# Locations of the fitted artifacts, relative to the current working directory.
_ARTIFACT_PATHS = {
    "tfidf": 'models/tfidf_model.joblib',
    "pca": 'models/pca_model.joblib',
    "classifier": 'models/classifier_model.joblib',
}
# Artifacts already read from disk, keyed like _ARTIFACT_PATHS.
_ARTIFACTS = {}


def _get_artifact(name):
    """Return the named artifact, loading it from disk only on first use."""
    if name not in _ARTIFACTS:
        _ARTIFACTS[name] = load(_ARTIFACT_PATHS[name])
    return _ARTIFACTS[name]


def pipeline_inference(input : str):
    """Run the inference pipeline on a single text.

    The text is preprocessed with ``TextProcessing``, vectorised with the
    TF-IDF model, reduced with the PCA model and scored by the classifier.

    Args:
        input: Raw text to classify.

    Returns:
        The result of the classifier's ``predict_proba`` for the text.
    """
    # Loaded once and reused, instead of re-reading three files on every call
    tfidf_model = _get_artifact("tfidf")
    pca_model = _get_artifact("pca")
    classifier_model = _get_artifact("classifier")

    # preprocess the input
    text_processed = TextProcessing().fit_transform_text(input)
    vector = tfidf_model.transform([text_processed])
    vector_pca = pca_model.transform(vector)
    # name the columns dim1..dimN after the PCA output width
    # (dim1..dim5 for the 5-component model used here)
    columns = [f"dim{i}" for i in range(1, vector_pca.shape[1] + 1)]
    df = pd.DataFrame(vector_pca, columns = columns)
    # make the prediction
    return classifier_model.predict_proba(df)
src/app/requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==1.26.4
2
+ pandas==2.2.0
3
+ pyarrow==15.0.0
4
+ fastparquet==2024.2.0
5
+ mlflow==2.10.2
6
+ nltk==3.8.1
7
+ seaborn==0.13.2
8
+ matplotlib==3.8.2
9
+ python-dotenv==1.0.1
10
+ plotly==5.19.0
11
+ lime==0.2.0.1
src/app/test.ipynb ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import mlflow\n",
10
+ "from dotenv import load_dotenv"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 5,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "True"
22
+ ]
23
+ },
24
+ "execution_count": 5,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "load_dotenv('../../.env')"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 6,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "tfidf_logged_model = 'runs:/a63128b897bd4f91a06f20939a715b98/tfidf_model'"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 7,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "name": "stderr",
49
+ "output_type": "stream",
50
+ "text": [
51
+ "c:\\Users\\sebit\\.conda\\envs\\mlops\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
52
+ " from .autonotebook import tqdm as notebook_tqdm\n",
53
+ "Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00, 2.50it/s]\n"
54
+ ]
55
+ }
56
+ ],
57
+ "source": [
58
+ "tfidf_model = mlflow.sklearn.load_model(tfidf_logged_model)"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 9,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "data": {
68
+ "text/html": [
69
+ "<style>#sk-container-id-1 {\n",
70
+ " /* Definition of color scheme common for light and dark mode */\n",
71
+ " --sklearn-color-text: black;\n",
72
+ " --sklearn-color-line: gray;\n",
73
+ " /* Definition of color scheme for unfitted estimators */\n",
74
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
75
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
76
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
77
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
78
+ " /* Definition of color scheme for fitted estimators */\n",
79
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
80
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
81
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
82
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
83
+ "\n",
84
+ " /* Specific color for light theme */\n",
85
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
86
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
87
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
88
+ " --sklearn-color-icon: #696969;\n",
89
+ "\n",
90
+ " @media (prefers-color-scheme: dark) {\n",
91
+ " /* Redefinition of color scheme for dark theme */\n",
92
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
93
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
94
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
95
+ " --sklearn-color-icon: #878787;\n",
96
+ " }\n",
97
+ "}\n",
98
+ "\n",
99
+ "#sk-container-id-1 {\n",
100
+ " color: var(--sklearn-color-text);\n",
101
+ "}\n",
102
+ "\n",
103
+ "#sk-container-id-1 pre {\n",
104
+ " padding: 0;\n",
105
+ "}\n",
106
+ "\n",
107
+ "#sk-container-id-1 input.sk-hidden--visually {\n",
108
+ " border: 0;\n",
109
+ " clip: rect(1px 1px 1px 1px);\n",
110
+ " clip: rect(1px, 1px, 1px, 1px);\n",
111
+ " height: 1px;\n",
112
+ " margin: -1px;\n",
113
+ " overflow: hidden;\n",
114
+ " padding: 0;\n",
115
+ " position: absolute;\n",
116
+ " width: 1px;\n",
117
+ "}\n",
118
+ "\n",
119
+ "#sk-container-id-1 div.sk-dashed-wrapped {\n",
120
+ " border: 1px dashed var(--sklearn-color-line);\n",
121
+ " margin: 0 0.4em 0.5em 0.4em;\n",
122
+ " box-sizing: border-box;\n",
123
+ " padding-bottom: 0.4em;\n",
124
+ " background-color: var(--sklearn-color-background);\n",
125
+ "}\n",
126
+ "\n",
127
+ "#sk-container-id-1 div.sk-container {\n",
128
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
129
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
130
+ " so we also need the `!important` here to be able to override the\n",
131
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
132
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
133
+ " display: inline-block !important;\n",
134
+ " position: relative;\n",
135
+ "}\n",
136
+ "\n",
137
+ "#sk-container-id-1 div.sk-text-repr-fallback {\n",
138
+ " display: none;\n",
139
+ "}\n",
140
+ "\n",
141
+ "div.sk-parallel-item,\n",
142
+ "div.sk-serial,\n",
143
+ "div.sk-item {\n",
144
+ " /* draw centered vertical line to link estimators */\n",
145
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
146
+ " background-size: 2px 100%;\n",
147
+ " background-repeat: no-repeat;\n",
148
+ " background-position: center center;\n",
149
+ "}\n",
150
+ "\n",
151
+ "/* Parallel-specific style estimator block */\n",
152
+ "\n",
153
+ "#sk-container-id-1 div.sk-parallel-item::after {\n",
154
+ " content: \"\";\n",
155
+ " width: 100%;\n",
156
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
157
+ " flex-grow: 1;\n",
158
+ "}\n",
159
+ "\n",
160
+ "#sk-container-id-1 div.sk-parallel {\n",
161
+ " display: flex;\n",
162
+ " align-items: stretch;\n",
163
+ " justify-content: center;\n",
164
+ " background-color: var(--sklearn-color-background);\n",
165
+ " position: relative;\n",
166
+ "}\n",
167
+ "\n",
168
+ "#sk-container-id-1 div.sk-parallel-item {\n",
169
+ " display: flex;\n",
170
+ " flex-direction: column;\n",
171
+ "}\n",
172
+ "\n",
173
+ "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
174
+ " align-self: flex-end;\n",
175
+ " width: 50%;\n",
176
+ "}\n",
177
+ "\n",
178
+ "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
179
+ " align-self: flex-start;\n",
180
+ " width: 50%;\n",
181
+ "}\n",
182
+ "\n",
183
+ "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
184
+ " width: 0;\n",
185
+ "}\n",
186
+ "\n",
187
+ "/* Serial-specific style estimator block */\n",
188
+ "\n",
189
+ "#sk-container-id-1 div.sk-serial {\n",
190
+ " display: flex;\n",
191
+ " flex-direction: column;\n",
192
+ " align-items: center;\n",
193
+ " background-color: var(--sklearn-color-background);\n",
194
+ " padding-right: 1em;\n",
195
+ " padding-left: 1em;\n",
196
+ "}\n",
197
+ "\n",
198
+ "\n",
199
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
200
+ "clickable and can be expanded/collapsed.\n",
201
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
202
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
203
+ "*/\n",
204
+ "\n",
205
+ "/* Pipeline and ColumnTransformer style (default) */\n",
206
+ "\n",
207
+ "#sk-container-id-1 div.sk-toggleable {\n",
208
+ " /* Default theme specific background. It is overwritten whether we have a\n",
209
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
210
+ " background-color: var(--sklearn-color-background);\n",
211
+ "}\n",
212
+ "\n",
213
+ "/* Toggleable label */\n",
214
+ "#sk-container-id-1 label.sk-toggleable__label {\n",
215
+ " cursor: pointer;\n",
216
+ " display: block;\n",
217
+ " width: 100%;\n",
218
+ " margin-bottom: 0;\n",
219
+ " padding: 0.5em;\n",
220
+ " box-sizing: border-box;\n",
221
+ " text-align: center;\n",
222
+ "}\n",
223
+ "\n",
224
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
225
+ " /* Arrow on the left of the label */\n",
226
+ " content: \"▸\";\n",
227
+ " float: left;\n",
228
+ " margin-right: 0.25em;\n",
229
+ " color: var(--sklearn-color-icon);\n",
230
+ "}\n",
231
+ "\n",
232
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
233
+ " color: var(--sklearn-color-text);\n",
234
+ "}\n",
235
+ "\n",
236
+ "/* Toggleable content - dropdown */\n",
237
+ "\n",
238
+ "#sk-container-id-1 div.sk-toggleable__content {\n",
239
+ " max-height: 0;\n",
240
+ " max-width: 0;\n",
241
+ " overflow: hidden;\n",
242
+ " text-align: left;\n",
243
+ " /* unfitted */\n",
244
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
245
+ "}\n",
246
+ "\n",
247
+ "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
248
+ " /* fitted */\n",
249
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
250
+ "}\n",
251
+ "\n",
252
+ "#sk-container-id-1 div.sk-toggleable__content pre {\n",
253
+ " margin: 0.2em;\n",
254
+ " border-radius: 0.25em;\n",
255
+ " color: var(--sklearn-color-text);\n",
256
+ " /* unfitted */\n",
257
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
258
+ "}\n",
259
+ "\n",
260
+ "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
261
+ " /* unfitted */\n",
262
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
263
+ "}\n",
264
+ "\n",
265
+ "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
266
+ " /* Expand drop-down */\n",
267
+ " max-height: 200px;\n",
268
+ " max-width: 100%;\n",
269
+ " overflow: auto;\n",
270
+ "}\n",
271
+ "\n",
272
+ "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
273
+ " content: \"▾\";\n",
274
+ "}\n",
275
+ "\n",
276
+ "/* Pipeline/ColumnTransformer-specific style */\n",
277
+ "\n",
278
+ "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
279
+ " color: var(--sklearn-color-text);\n",
280
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
281
+ "}\n",
282
+ "\n",
283
+ "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
284
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
285
+ "}\n",
286
+ "\n",
287
+ "/* Estimator-specific style */\n",
288
+ "\n",
289
+ "/* Colorize estimator box */\n",
290
+ "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
291
+ " /* unfitted */\n",
292
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
293
+ "}\n",
294
+ "\n",
295
+ "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
296
+ " /* fitted */\n",
297
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
298
+ "}\n",
299
+ "\n",
300
+ "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
301
+ "#sk-container-id-1 div.sk-label label {\n",
302
+ " /* The background is the default theme color */\n",
303
+ " color: var(--sklearn-color-text-on-default-background);\n",
304
+ "}\n",
305
+ "\n",
306
+ "/* On hover, darken the color of the background */\n",
307
+ "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
308
+ " color: var(--sklearn-color-text);\n",
309
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
310
+ "}\n",
311
+ "\n",
312
+ "/* Label box, darken color on hover, fitted */\n",
313
+ "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
314
+ " color: var(--sklearn-color-text);\n",
315
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
316
+ "}\n",
317
+ "\n",
318
+ "/* Estimator label */\n",
319
+ "\n",
320
+ "#sk-container-id-1 div.sk-label label {\n",
321
+ " font-family: monospace;\n",
322
+ " font-weight: bold;\n",
323
+ " display: inline-block;\n",
324
+ " line-height: 1.2em;\n",
325
+ "}\n",
326
+ "\n",
327
+ "#sk-container-id-1 div.sk-label-container {\n",
328
+ " text-align: center;\n",
329
+ "}\n",
330
+ "\n",
331
+ "/* Estimator-specific */\n",
332
+ "#sk-container-id-1 div.sk-estimator {\n",
333
+ " font-family: monospace;\n",
334
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
335
+ " border-radius: 0.25em;\n",
336
+ " box-sizing: border-box;\n",
337
+ " margin-bottom: 0.5em;\n",
338
+ " /* unfitted */\n",
339
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
340
+ "}\n",
341
+ "\n",
342
+ "#sk-container-id-1 div.sk-estimator.fitted {\n",
343
+ " /* fitted */\n",
344
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
345
+ "}\n",
346
+ "\n",
347
+ "/* on hover */\n",
348
+ "#sk-container-id-1 div.sk-estimator:hover {\n",
349
+ " /* unfitted */\n",
350
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
351
+ "}\n",
352
+ "\n",
353
+ "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
354
+ " /* fitted */\n",
355
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
356
+ "}\n",
357
+ "\n",
358
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
359
+ "\n",
360
+ "/* Common style for \"i\" and \"?\" */\n",
361
+ "\n",
362
+ ".sk-estimator-doc-link,\n",
363
+ "a:link.sk-estimator-doc-link,\n",
364
+ "a:visited.sk-estimator-doc-link {\n",
365
+ " float: right;\n",
366
+ " font-size: smaller;\n",
367
+ " line-height: 1em;\n",
368
+ " font-family: monospace;\n",
369
+ " background-color: var(--sklearn-color-background);\n",
370
+ " border-radius: 1em;\n",
371
+ " height: 1em;\n",
372
+ " width: 1em;\n",
373
+ " text-decoration: none !important;\n",
374
+ " margin-left: 1ex;\n",
375
+ " /* unfitted */\n",
376
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
377
+ " color: var(--sklearn-color-unfitted-level-1);\n",
378
+ "}\n",
379
+ "\n",
380
+ ".sk-estimator-doc-link.fitted,\n",
381
+ "a:link.sk-estimator-doc-link.fitted,\n",
382
+ "a:visited.sk-estimator-doc-link.fitted {\n",
383
+ " /* fitted */\n",
384
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
385
+ " color: var(--sklearn-color-fitted-level-1);\n",
386
+ "}\n",
387
+ "\n",
388
+ "/* On hover */\n",
389
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
390
+ ".sk-estimator-doc-link:hover,\n",
391
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
392
+ ".sk-estimator-doc-link:hover {\n",
393
+ " /* unfitted */\n",
394
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
395
+ " color: var(--sklearn-color-background);\n",
396
+ " text-decoration: none;\n",
397
+ "}\n",
398
+ "\n",
399
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
400
+ ".sk-estimator-doc-link.fitted:hover,\n",
401
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
402
+ ".sk-estimator-doc-link.fitted:hover {\n",
403
+ " /* fitted */\n",
404
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
405
+ " color: var(--sklearn-color-background);\n",
406
+ " text-decoration: none;\n",
407
+ "}\n",
408
+ "\n",
409
+ "/* Span, style for the box shown on hovering the info icon */\n",
410
+ ".sk-estimator-doc-link span {\n",
411
+ " display: none;\n",
412
+ " z-index: 9999;\n",
413
+ " position: relative;\n",
414
+ " font-weight: normal;\n",
415
+ " right: .2ex;\n",
416
+ " padding: .5ex;\n",
417
+ " margin: .5ex;\n",
418
+ " width: min-content;\n",
419
+ " min-width: 20ex;\n",
420
+ " max-width: 50ex;\n",
421
+ " color: var(--sklearn-color-text);\n",
422
+ " box-shadow: 2pt 2pt 4pt #999;\n",
423
+ " /* unfitted */\n",
424
+ " background: var(--sklearn-color-unfitted-level-0);\n",
425
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
426
+ "}\n",
427
+ "\n",
428
+ ".sk-estimator-doc-link.fitted span {\n",
429
+ " /* fitted */\n",
430
+ " background: var(--sklearn-color-fitted-level-0);\n",
431
+ " border: var(--sklearn-color-fitted-level-3);\n",
432
+ "}\n",
433
+ "\n",
434
+ ".sk-estimator-doc-link:hover span {\n",
435
+ " display: block;\n",
436
+ "}\n",
437
+ "\n",
438
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
439
+ "\n",
440
+ "#sk-container-id-1 a.estimator_doc_link {\n",
441
+ " float: right;\n",
442
+ " font-size: 1rem;\n",
443
+ " line-height: 1em;\n",
444
+ " font-family: monospace;\n",
445
+ " background-color: var(--sklearn-color-background);\n",
446
+ " border-radius: 1rem;\n",
447
+ " height: 1rem;\n",
448
+ " width: 1rem;\n",
449
+ " text-decoration: none;\n",
450
+ " /* unfitted */\n",
451
+ " color: var(--sklearn-color-unfitted-level-1);\n",
452
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
453
+ "}\n",
454
+ "\n",
455
+ "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
456
+ " /* fitted */\n",
457
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
458
+ " color: var(--sklearn-color-fitted-level-1);\n",
459
+ "}\n",
460
+ "\n",
461
+ "/* On hover */\n",
462
+ "#sk-container-id-1 a.estimator_doc_link:hover {\n",
463
+ " /* unfitted */\n",
464
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
465
+ " color: var(--sklearn-color-background);\n",
466
+ " text-decoration: none;\n",
467
+ "}\n",
468
+ "\n",
469
+ "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
470
+ " /* fitted */\n",
471
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
472
+ "}\n",
473
+ "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>TfidfVectorizer(max_df=0.95, max_features=2000, min_df=0.1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;TfidfVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\">?<span>Documentation for TfidfVectorizer</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>TfidfVectorizer(max_df=0.95, max_features=2000, min_df=0.1)</pre></div> </div></div></div></div>"
474
+ ],
475
+ "text/plain": [
476
+ "TfidfVectorizer(max_df=0.95, max_features=2000, min_df=0.1)"
477
+ ]
478
+ },
479
+ "execution_count": 9,
480
+ "metadata": {},
481
+ "output_type": "execute_result"
482
+ }
483
+ ],
484
+ "source": [
485
+ "tfidf_model"
486
+ ]
487
+ }
488
+ ],
489
+ "metadata": {
490
+ "kernelspec": {
491
+ "display_name": "mlops",
492
+ "language": "python",
493
+ "name": "python3"
494
+ },
495
+ "language_info": {
496
+ "codemirror_mode": {
497
+ "name": "ipython",
498
+ "version": 3
499
+ },
500
+ "file_extension": ".py",
501
+ "mimetype": "text/x-python",
502
+ "name": "python",
503
+ "nbconvert_exporter": "python",
504
+ "pygments_lexer": "ipython3",
505
+ "version": "3.11.7"
506
+ }
507
+ },
508
+ "nbformat": 4,
509
+ "nbformat_minor": 2
510
+ }
src/app/utils/__init__.py ADDED
File without changes
src/app/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (167 Bytes). View file
 
src/app/utils/__pycache__/log_model.cpython-311.pyc ADDED
Binary file (4.4 kB). View file
 
src/app/utils/__pycache__/text_features.cpython-311.pyc ADDED
Binary file (4.43 kB). View file
 
src/app/utils/__pycache__/text_processing.cpython-311.pyc ADDED
Binary file (9.83 kB). View file
 
src/app/utils/download_model.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def pipeline_download_models():
    """
    Download the models from the mlflow server and save them in the models folder.

    The TF-IDF model, the PCA model and the classifier are each loaded from
    their mlflow run and dumped as a joblib file.

    Args:
        None

    Returns:
        None
    """
    # Imported locally: this module has no top-level imports, so these names
    # were undefined and the function raised NameError when called.
    import mlflow
    import mlflow.sklearn
    from dotenv import load_dotenv
    from joblib import dump

    # NOTE(review): this and the output paths below are relative to the
    # current working directory - confirm where this is meant to be run from.
    load_dotenv('../../.env')

    # (mlflow model URI, output file) for each artifact, in download order
    artifacts = (
        ('runs:/a63128b897bd4f91a06f20939a715b98/tfidf_model', '../../models/tfidf_model.joblib'),
        ('runs:/a63128b897bd4f91a06f20939a715b98/pca_model', '../../models/pca_model.joblib'),
        ('runs:/49483b7a0f95430a8492a448ac13e8d7/random-forest', '../../models/classifier_model.joblib'),
    )
    for logged_model, target_path in artifacts:
        dump(mlflow.sklearn.load_model(logged_model), target_path)
src/app/utils/log_model.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mlflow
2
+ from datetime import datetime
3
+ from sklearn.metrics import classification_report
4
+
5
class LogModel:
    """Evaluate a fitted model and log its metrics, parameters and artifact to mlflow."""

    def __init__(self, mlflow_uri : str, mlflow_experiment_name : str, mlflow_run_name : str, X_train, Y_train, X_test, Y_test, model, model_name) -> None:
        """
        Store the data and model, and point mlflow at the tracking server.

        Args:
            mlflow_uri: Tracking URI passed to ``mlflow.set_tracking_uri``.
            mlflow_experiment_name: Experiment passed to ``mlflow.set_experiment``.
            mlflow_run_name: Prefix of the run name; a timestamp is appended.
            X_train, Y_train: Training features and labels.
            X_test, Y_test: Test features and labels.
            model: Model exposing ``predict`` and ``get_params``.
            model_name: Artifact name used when logging the model.
        """
        self.mlflow_uri = mlflow_uri
        self.mlflow_experiment_name = mlflow_experiment_name
        self.mlflow_run_name = mlflow_run_name
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.Y_test = Y_test
        self.model_name = model_name
        self.model = model
        # set the mlflow uri
        mlflow.set_tracking_uri(self.mlflow_uri)
        mlflow.set_experiment(self.mlflow_experiment_name)

    def _log_report(self, features, labels, prefix=""):
        """Log every value of the classification report and return the report."""
        report = classification_report(labels, self.model.predict(features), output_dict=True)
        for class_or_avg, value in report.items():
            if isinstance(value, dict):
                for metric, score in value.items():
                    mlflow.log_metric(prefix + class_or_avg + '_' + metric, score)
            else:
                # scalar entries such as "accuracy"
                mlflow.log_metric(prefix + class_or_avg, value)
        return report

    def evaluate_train_data(self):
        """
        This function evaluates the model on the training data

        Metrics are logged with a ``train_`` prefix so they do not collide
        with the test metrics, which keep the unprefixed names.
        """
        self.report1 = self._log_report(self.X_train, self.Y_train, prefix="train_")

    def evaluate_test_data(self):
        """
        This function evaluates the model on the test data
        """
        self.report2 = self._log_report(self.X_test, self.Y_test)

    def register_model(self):
        """
        This function registers the model parameters and the model itself
        """
        params = self.model.get_params()
        mlflow.log_params(params)
        mlflow.sklearn.log_model(self.model, self.model_name)

    def fit_transform(self):
        """
        Run the whole logging pipeline inside a single mlflow run and print
        the test report.
        """
        run_name = self.mlflow_run_name + " " + datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        # the context manager ends the run on exit, so no explicit end_run()
        with mlflow.start_run(run_name = run_name):
            self.evaluate_train_data()
            self.evaluate_test_data()
            self.register_model()
        print("Model performance over the test dataset")
        print(self.report2)
src/app/utils/text_features.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ import pandas as pd
3
+ from joblib import dump
4
+ import numpy as np
5
+ from sklearn.decomposition import PCA
6
+ import mlflow
7
+ from datetime import datetime
8
+
9
class FeatureTextExtraction:
    """
    Turn pre-processed text into numeric features.

    The "processed_text" column is vectorized with TF-IDF, the resulting
    matrix is reduced to five PCA components, and both fitted models are
    logged to MLflow.
    """

    def __init__(self, mlflow_uri : str, mlflow_experiment_name : str, mlflow_run_name : str) -> None:
        """
        Create the TF-IDF and PCA models and configure MLflow.

        Args:
            mlflow_uri: tracking URI passed to mlflow.set_tracking_uri
            mlflow_experiment_name: experiment passed to mlflow.set_experiment
            mlflow_run_name: prefix of the run name (a timestamp is appended
                when the run is started)

        Returns:
            None
        """
        self.vectorizer = TfidfVectorizer(max_df=0.95, min_df=0.1, max_features=2000)
        self.pca = PCA(5, random_state=99)
        self.mlflow_uri = mlflow_uri
        self.mlflow_experiment_name = mlflow_experiment_name
        self.mlflow_run_name = mlflow_run_name
        # set the mlflow uri
        mlflow.set_tracking_uri(self.mlflow_uri)
        mlflow.set_experiment(self.mlflow_experiment_name)

    def fit_tfidf(self, df: pd.DataFrame) -> None:
        """
        Fit the TF-IDF vectorizer and store the resulting matrix.

        Rows whose "processed_text" is missing are dropped first; the
        filtered frame is kept in self.df and the TF-IDF matrix in
        self.matrix.

        Args:
            df: pd.DataFrame: The dataframe containing a "processed_text" column

        Returns:
            None
        """
        self.df = df.dropna(subset=["processed_text"])
        # Fit on the FILTERED frame: fitting on the raw input would feed the
        # missing values that were just dropped back into the vectorizer.
        self.matrix = self.vectorizer.fit_transform(self.df["processed_text"])

    def dimesion_reduction(self) -> pd.DataFrame:
        """
        Reduce the TF-IDF matrix with PCA.

        Must be called after fit_tfidf, which creates self.matrix.

        Returns:
            pd.DataFrame: one row per document, columns "dim1" ... "dimN"
            (N is 5 with the PCA configured in __init__), default integer index
        """
        # PCA is fed a dense array, so the sparse TF-IDF matrix is densified
        self.reduced_data = self.pca.fit_transform(self.matrix.toarray())
        # convert to dataframe
        columns = [f"dim{i}" for i in range(1, self.reduced_data.shape[1] + 1)]
        self.reduced_df = pd.DataFrame(self.reduced_data, columns=columns)
        return self.reduced_df

    def fit_transform(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        Fit TF-IDF and PCA on the data, log both models to MLflow and return
        the data enriched with the PCA components.

        Args:
            df: pd.DataFrame: The dataframe containing a "processed_text" column

        Returns:
            pd.DataFrame: the rows of df that have a "processed_text" value,
            with the PCA component columns appended
        """
        timestamp = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        # The context manager ends the run on exit, so no explicit
        # mlflow.end_run() is needed.
        with mlflow.start_run(run_name=f"{self.mlflow_run_name} {timestamp}"):
            # fit the TF-IDF model
            self.fit_tfidf(df)
            # log the fitted TF-IDF model
            mlflow.sklearn.log_model(self.vectorizer, "tfidf_model")
            # fit the PCA model and get the reduced features
            self.data = self.dimesion_reduction()
            # log the fitted PCA model
            mlflow.sklearn.log_model(self.pca, "pca_model")
        # self.data has a fresh 0..n-1 index while self.df keeps the index of
        # the input (possibly with gaps after dropna). Re-label the features
        # with self.df's index so that concat pairs each row with its own
        # features instead of aligning on unrelated labels.
        aligned_features = self.data.set_index(self.df.index)
        self.final_df = pd.concat([self.df, aligned_features], axis=1)
        return self.final_df
src/app/utils/text_processing.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.corpus import stopwords
2
+ from nltk.stem import WordNetLemmatizer
3
+ import pandas as pd
4
+ from nltk.stem import PorterStemmer
5
+ import re
6
+
7
+
8
class TextProcessing:
    """
    This class contains all methods to process text data.

    The full pipeline (see fit_transform / fit_transform_text) is:
    strip punctuation, newlines and digits -> tokenize -> lowercase ->
    remove stopwords -> remove short tokens -> stem -> join.
    """

    def __init__(self, language : str = 'english'):
        """
        Load the stopwords for the given language and create the
        lemmatizer and the stemmer.

        Args:
            language : language name passed to stopwords.words
        """
        self.list_stopwords = list(set(stopwords.words(language)))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def tokenize(self, text : str) -> list:
        """
        This function takes a string and returns a list of words in the string.

        Args:
            text : A string of words

        Returns:
            the tokens, split on any run of whitespace
        """
        return text.split()

    def remove_stopwords(self, list_tokens : list) -> list:
        """
        This function removes the stopwords from the list of tokens.

        Args:
            list_tokens : list of tokens to process

        Returns:
            list of tokens with the stopwords removed
        """
        # Build the set once per call so that each membership test is O(1)
        # instead of a scan over the whole stopword list.
        stopword_set = set(self.list_stopwords)
        return [word for word in list_tokens if word not in stopword_set]

    def lemmatize_tokens(self, list_tokens : list) -> list:
        """
        This function lemmatizes a list of tokens.

        Args:
            list_tokens : list of tokens

        Returns:
            list of lemmatized tokens
        """
        return [self.lemmatizer.lemmatize(word) for word in list_tokens]

    def steem_tokens(self, list_tokens : list) -> list:
        """
        This function stems a list of tokens.

        Args:
            list_tokens : list of tokens

        Returns:
            list of stemmed tokens
        """
        return [self.stemmer.stem(word) for word in list_tokens]

    def lowercase_tokens(self, list_tokens : list) -> list:
        """
        This function receives a list of tokens and returns a list of tokens in lowercase

        Args:
            list_tokens: list of strings

        Returns:
            list of strings
        """
        return [word.lower() for word in list_tokens]

    def remove_short_tokens(self, token_list : list, min_length : int = 3) -> list:
        """
        This function removes words from a list of tokens that are shorter than min_length.

        Args:
            token_list: list of strings
            min_length: int, minimum length of the words to keep

        Returns:
            list of strings
        """
        return [word for word in token_list if len(word) >= min_length]

    def remove_punctuation(self, text : str) -> str:
        """
        This function removes punctuation, newlines and digits from a text.

        Args:
            text: the raw text (bytes are decoded as UTF-8 first)

        Returns:
            the cleaned text
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')  # decode when a byte string is received
        # drop everything that is neither a word character nor whitespace
        text = re.sub(r'[^\w\s]', '', text)
        # NOTE(review): newlines are deleted rather than replaced by a space,
        # so the last word of a line is glued to the first word of the next
        # one ("end\nstart" -> "endstart"). Left unchanged because already
        # trained models may depend on this exact pre-processing - confirm
        # before changing it.
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\d', '', text)
        return text

    def join_tokens_cleaned(self, token_list : list ) -> str:
        """
        This function joins the tokens in a list

        Args:
            token_list : list of tokens cleaned

        Returns:
            text : final phrase, tokens separated by a single space
        """
        return " ".join(token_list)

    def _process_clean_text(self, text : str) -> str:
        """
        Apply the token-level steps to a text whose punctuation was already removed.

        Args:
            text : text returned by remove_punctuation

        Returns:
            the processed text
        """
        tokens = self.tokenize(text)
        tokens = self.lowercase_tokens(tokens)
        tokens = self.remove_stopwords(tokens)
        tokens = self.remove_short_tokens(tokens)
        tokens = self.steem_tokens(tokens)
        return self.join_tokens_cleaned(tokens)

    def fit_transform(self, df : pd.DataFrame) -> pd.DataFrame:
        """
        This function receives a dataframe and applies the text processing methods to the text column.

        The dataframe is modified in place: the 'text' column is overwritten
        with its punctuation-free version and 'processed_text' is added.

        Args:
            df : pandas DataFrame with a column named 'text'

        Returns:
            df : the same DataFrame with a column named 'processed_text'
        """
        df['text'] = df['text'].apply(self.remove_punctuation)
        df['processed_text'] = df['text'].apply(self._process_clean_text)

        return df

    def fit_transform_text(self, text):
        """
        This function receives a string and applies the text processing methods to it.

        Args:
            text : raw text (str, or UTF-8 encoded bytes)

        Returns:
            text : the processed text
        """
        return self._process_clean_text(self.remove_punctuation(text))
159
+
160
+
src/app/xai.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import sys
4
+ from lime.lime_text import LimeTextExplainer
5
+
6
+
7
+ from pipelines import pipeline_inference
8
+
9
def f(x):
    """
    Classifier function handed to LIME.

    Runs pipeline_inference on every text of x and stacks the outputs.

    Args:
        x : sequence of texts

    Returns:
        numpy array of shape (len(x), 2), one row per text
    """
    # One column per class; two classes are assumed here.
    scores = np.zeros((len(x), 2))
    for row, sample in enumerate(x):
        scores[row, :] = pipeline_inference(sample)
    return scores
15
+
16
+
17
def get_explanation(text, num_features=30, num_samples=10):
    """
    Explain the prediction for a text with LIME.

    Args:
        text : the text to explain
        num_features : maximum number of words in the explanation
        num_samples : number of perturbed texts LIME generates; each one is
            scored through f, so a larger value is slower
            (NOTE(review): 10 is small for a stable explanation - confirm
            that this default is a deliberate speed trade-off)

    Returns:
        dict mapping each word of the explanation to its weight
        (presumably for the "AI" class, since no label is passed and LIME
        is expected to default to label 1 - verify against the LIME docs)
    """
    explainer = LimeTextExplainer(class_names=["Human", "AI"])
    explanation = explainer.explain_instance(
        text_instance=text,
        classifier_fn=f,
        num_features=num_features,
        num_samples=num_samples,
    )
    # as_list() yields (word, weight) pairs
    return dict(explanation.as_list())