Upload 6 files

Files added:
- .env (+2 lines)
- Dockerfile (+12 lines)
- app.py (+286 lines)
- helpers.py (+146 lines)
- lancedb_client.py (+100 lines)
- requirements.txt (+18 lines)
.env
ADDED

_DEFAULT_PARSE_METADATA=('dcterms:identifier','dcterms:type','dcterms:title','dcterms:description','dcterms:creator','dcterms:publisher','dcterms:date','dcterms:spatial','dcterms:format','dcterms:provenance','dcterms:subject','dcterms:medium','bibo:annotates','bibo:content','bibo:locator','bibo:owner')
HF_TOKEN=""
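Both variables are read at import time via python-dotenv. Since _DEFAULT_PARSE_METADATA is stored as a Python tuple literal, ast.literal_eval is one safe way to recover it as a tuple; a minimal sketch, run from the project root so .env is found:

import ast
import os
from dotenv import load_dotenv

load_dotenv()  # picks up .env from the current working directory

# The value is a Python tuple literal, so literal_eval parses it safely
metadata_keys = ast.literal_eval(os.getenv("_DEFAULT_PARSE_METADATA", "()"))
print(metadata_keys[:3])  # ('dcterms:identifier', 'dcterms:type', 'dcterms:title')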
Dockerfile
ADDED

# syntax=docker/dockerfile:1
FROM python:3.11-slim
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir --upgrade -r requirements.txt
COPY --chown=user . /app
EXPOSE 8050
CMD gunicorn --workers 5 --threads 2 -b 0.0.0.0:8050 --timeout 0 app:server
app.py
ADDED

import dash
from dash import dcc, html, Input, Output, State, ctx
import dash_bootstrap_components as dbc
import plotly.express as px
import pandas as pd
import numpy as np
import umap
import hdbscan
import sklearn.feature_extraction.text as text
from dash.exceptions import PreventUpdate
import os
from dotenv import load_dotenv
import helpers
import lancedb
from omeka_s_api_client import OmekaSClient, OmekaSClientError
from lancedb_client import LanceDBManager

# Load .env for credentials
load_dotenv()
_DEFAULT_PARSE_METADATA = (
    'dcterms:identifier', 'dcterms:type', 'dcterms:title', 'dcterms:description',
    'dcterms:creator', 'dcterms:publisher', 'dcterms:date', 'dcterms:spatial',
    'dcterms:format', 'dcterms:provenance', 'dcterms:subject', 'dcterms:medium',
    'bibo:annotates', 'bibo:content', 'bibo:locator', 'bibo:owner'
)

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
app.config.suppress_callback_exceptions = True
server = app.server
manager = LanceDBManager()

french_stopwords = text.ENGLISH_STOP_WORDS.union([
    "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon",
    "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans",
    "des", "du", "dedans", "dehors", "depuis", "devrait", "doit", "donc", "dos",
    "début", "elle", "elles", "en", "encore", "essai", "est", "et", "eu", "fait",
    "faites", "fois", "font", "hors", "ici", "il", "ils", "je", "juste", "la", "le",
    "les", "leur", "là", "ma", "maintenant", "mais", "mes", "mine", "moins", "mon",
    "mot", "même", "ni", "nommés", "notre", "nous", "nouveaux", "ou", "où", "par",
    "parce", "parole", "pas", "personnes", "peut", "peu", "pièce", "plupart", "pour",
    "pourquoi", "quand", "que", "quel", "quelle", "quelles", "quels", "qui", "sa",
    "sans", "ses", "seulement", "si", "sien", "son", "sont", "sous", "soyez", "sujet",
    "sur", "ta", "tandis", "tellement", "tels", "tes", "ton", "tous", "tout", "trop",
    "très", "tu", "valeur", "voie", "voient", "vont", "votre", "vous", "vu", "ça",
    "étaient", "état", "étions", "été", "être"
])

# -------------------- Layout --------------------
app.layout = dbc.Container([
    html.H2("🌍 Omeka S UMAP Explorer", className="text-center mt-4"),
    html.Hr(),

    # Input controls
    dbc.Row([
        dbc.Col([
            html.H5("🔍 From Omeka S"),
            dcc.Input(id="api-url", value="https://your-omeka-instance.org", type="text", className="form-control"),
            dbc.Button("Load Item Sets", id="load-sets", color="secondary", className="mt-2"),
            dcc.Dropdown(id="items-sets-dropdown", placeholder="Select a collection"),
            dcc.Input(id="table-name", value="my_table", type="text", className="form-control mt-2", placeholder="New table name"),
            dbc.Button("Process Omeka Collection", id="load-data", color="primary", className="mt-2"),
        ], md=4),

        dbc.Col([
            html.H5("📁 From LanceDB"),
            dbc.Button("Load Existing Tables", id="load-tables", color="info"),
            dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
            dbc.Button("Display Table", id="load-data-db", color="success", className="mt-2"),
        ], md=4),

        dbc.Col([
            html.H5("🔎 Query Tool (coming soon)"),
            dbc.Input(placeholder="Type a search query...", type="text", disabled=True),
        ], md=4),
    ], className="mb-4"),

    # Main plot area and metadata side panel
    dbc.Row([
        dbc.Col(
            dcc.Graph(id="umap-graph", style={"height": "700px"}),
            md=8
        ),
        dbc.Col(
            html.Div(id="point-details", style={
                "padding": "15px",
                "borderLeft": "1px solid #ccc",
                "height": "700px",
                "overflowY": "auto"
            }),
            md=4
        ),
    ]),

    # Status/info
    html.Div(id="status", className="mt-3"),

    dcc.Store(id="omeka-client-config", storage_type="session")
], fluid=True)

# -------------------- Callbacks --------------------

@app.callback(
    Output("items-sets-dropdown", "options"),
    Output("omeka-client-config", "data"),
    Input("load-sets", "n_clicks"),
    State("api-url", "value"),
    prevent_initial_call=True
)
def load_item_sets(n, base_url):
    client = OmekaSClient(base_url, "...", "...", 50)
    try:
        item_sets = client.list_all_item_sets()
        options = [{"label": s.get('dcterms:title', [{}])[0].get('@value', 'N/A'), "value": s["o:id"]} for s in item_sets]
        return options, {
            "base_url": base_url,
            "key_identity": "...",
            "key_credential": "...",
            "default_per_page": 50
        }
    except Exception:
        return dash.no_update, dash.no_update

@app.callback(
    Output("db-tables-dropdown", "options"),
    Input("load-tables", "n_clicks"),
    prevent_initial_call=True
)
def list_tables(n):
    return [{"label": t, "value": t} for t in manager.list_tables()]

@app.callback(
    Output("umap-graph", "figure"),
    Output("status", "children"),
    Input("load-data", "n_clicks"),      # From Omeka S
    Input("load-data-db", "n_clicks"),   # From DB table
    State("items-sets-dropdown", "value"),
    State("omeka-client-config", "data"),
    State("table-name", "value"),
    State("db-tables-dropdown", "value"),
    prevent_initial_call=True
)
def handle_data_loading(n_clicks_omeka, n_clicks_db, item_set_id, client_config, table_name, db_table):
    triggered_id = ctx.triggered_id
    print(triggered_id)

    if triggered_id == "load-data":  # Omeka S case
        if not client_config:
            raise PreventUpdate

        client = OmekaSClient(
            base_url=client_config["base_url"],
            key_identity=client_config["key_identity"],
            key_credential=client_config["key_credential"]
        )

        df_omeka = harvest_omeka_items(client, item_set_id=item_set_id)
        items = df_omeka.to_dict(orient="records")
        records_with_text = [helpers.add_concatenated_text_field_exclude_keys(item, keys_to_exclude=['id', 'images_urls'], text_field_key='text', pair_separator=' - ') for item in items]
        df = helpers.prepare_df_atlas(pd.DataFrame(records_with_text), id_col='id', images_col='images_urls')

        text_embed = helpers.generate_text_embed(df['text'].tolist())
        img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
        embeddings = np.concatenate([text_embed, img_embed], axis=1)
        df["embeddings"] = embeddings.tolist()

        reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
        umap_embeddings = reducer.fit_transform(embeddings)
        df["umap_embeddings"] = umap_embeddings.tolist()

        clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
        cluster_labels = clusterer.fit_predict(umap_embeddings)
        df["Cluster"] = cluster_labels

        vectorizer = text.TfidfVectorizer(max_features=1000, stop_words=list(french_stopwords), lowercase=True)
        tfidf_matrix = vectorizer.fit_transform(df["text"].astype(str).tolist())
        top_words = []
        for label in sorted(df["Cluster"].unique()):
            if label == -1:
                top_words.append("Noise")
                continue
            mask = (df["Cluster"] == label).to_numpy().nonzero()[0]
            cluster_docs = tfidf_matrix[mask]
            mean_tfidf = cluster_docs.mean(axis=0)
            mean_tfidf = np.asarray(mean_tfidf).flatten()
            top_indices = mean_tfidf.argsort()[::-1][:5]
            terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
            top_words.append(", ".join(terms))
        cluster_name_map = {label: name for label, name in zip(sorted(df["Cluster"].unique()), top_words)}
        df["Topic"] = df["Cluster"].map(cluster_name_map)
        # Cast to string before persisting: the LanceDB schema declares Cluster as pa.string()
        df["Cluster"] = df["Cluster"].astype(str)

        manager.initialize_table(table_name)
        manager.add_entry(table_name, df.to_dict(orient="records"))

    elif triggered_id == "load-data-db":  # Load existing LanceDB table
        if not db_table:
            raise PreventUpdate
        items = manager.get_content_table(db_table)
        df = pd.DataFrame(items)
        df = df.dropna(axis=1, how='all')
        df = df.fillna('')
        #umap_embeddings = np.array(df["umap_embeddings"].tolist())

    else:
        raise PreventUpdate

    # Plotting
    return create_umap_plot(df)


@app.callback(
    Output("point-details", "children"),
    Input("umap-graph", "clickData")
)
def show_point_details(clickData):
    if not clickData:
        return html.Div("🖱️ Click a point to see more details.", style={"color": "#888"})
    img_url, title, desc = clickData["points"][0]["customdata"]
    return html.Div([
        html.H4(title),
        html.Img(src=img_url, style={"maxWidth": "100%", "marginBottom": "10px"}),
        html.P(desc or "No description available.")
    ])

# -------------------- Utility --------------------

def harvest_omeka_items(client, item_set_id=None, per_page=50):
    """
    Fetch and parse items from Omeka S.
    Args:
        client: OmekaSClient instance
        item_set_id: ID of the item set to fetch items from (optional)
        per_page: Number of items to fetch per page (default: 50)
    Returns:
        DataFrame containing parsed item data (empty on error)
    """
    print("\n--- Fetching and Parsing Multiple Items by collection ---")
    try:
        # Fetch all items, optionally restricted to the selected item set
        items_list = client.list_all_items(item_set_id=item_set_id, per_page=per_page)
        print(items_list)
        print(f"Fetched {len(items_list)} items.")

        parsed_items_list = []
        for item_raw in items_list:
            if 'o:media' in item_raw:
                parsed = client.digest_item_data(item_raw, prefixes=_DEFAULT_PARSE_METADATA)
                if parsed:  # Only add if parsing was successful
                    # Add media (image URLs only)
                    medias_id = [x["o:id"] for x in item_raw["o:media"]]
                    medias_list = []
                    for media_id in medias_id:
                        media = client.get_media(media_id)
                        if "image" in media["o:media_type"]:
                            medias_list.append(media.get('o:original_url'))
                    if medias_list:  # Only append if there are image URLs
                        parsed["images_urls"] = medias_list
                        parsed_items_list.append(parsed)

        print(f"Successfully parsed {len(parsed_items_list)} items.")
        # Note: List columns (like dcterms:title) might need further handling in Pandas
        print("\nDataFrame from parsed items:")
        return pd.DataFrame(parsed_items_list)
    except OmekaSClientError as e:
        print(f"Error fetching/parsing multiple items: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during multi-item parsing: {e}")
    return pd.DataFrame()  # keep the caller's .to_dict() from failing on None

def create_umap_plot(df):
    coords = np.array(df["umap_embeddings"].tolist())
    fig = px.scatter(
        df, x=coords[:, 0], y=coords[:, 1],
        color="Topic",
        custom_data=["images_urls", "Title", "Description"],
        hover_data=None,
        title="UMAP Projection with HDBSCAN Topics"
    )
    fig.update_traces(
        marker=dict(size=8, line=dict(width=1, color="DarkSlateGrey")),
        # Plotly hover labels render only a small HTML subset; the <img> tag is
        # ignored there, so the image itself is shown in the side panel on click.
        hovertemplate="<b>%{customdata[1]}</b><br><img src='%{customdata[0]}' height='150'><extra></extra>"
    )
    fig.update_layout(height=700, margin=dict(t=30, b=30, l=30, r=30))
    return fig, f"Loaded {len(df)} items and projected into 2D."

if __name__ == "__main__":
    app.run(debug=True)
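The core of handle_data_loading is an embed → UMAP → HDBSCAN → TF-IDF-labeling pipeline. A self-contained sketch of the same steps on synthetic data, useful for exercising the labeling logic without an Omeka instance (the documents and the random embeddings are made up for illustration; the real app feeds 1536-dim text+image vectors):

import numpy as np
import umap
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer

rng = np.random.default_rng(0)
docs = [f"document {i} about subject {i % 3}" for i in range(60)]
embeddings = rng.normal(size=(60, 1536))  # stand-in for concatenated text+image vectors

# Project to 2D, then run density-based clustering on the projected points
coords = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine").fit_transform(embeddings)
labels = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(coords)

# Name each cluster by its top mean-TF-IDF terms; HDBSCAN marks noise as -1
vectorizer = TfidfVectorizer(max_features=1000, lowercase=True)
tfidf = vectorizer.fit_transform(docs)
terms = vectorizer.get_feature_names_out()
for label in sorted(set(labels)):
    if label == -1:
        print(-1, "Noise")
        continue
    idx = np.flatnonzero(labels == label)
    mean_tfidf = np.asarray(tfidf[idx].mean(axis=0)).flatten()
    print(label, ", ".join(terms[i] for i in mean_tfidf.argsort()[::-1][:5]))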
helpers.py
ADDED

from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
from PIL import Image
import requests
import os
import json
import math
import re
import pandas as pd
import numpy as np
from omeka_s_api_client import OmekaSClient, OmekaSClientError
from typing import List, Dict, Any, Union
import io
from dotenv import load_dotenv

# env var
load_dotenv(os.path.join(os.getcwd(), ".env"))
HF_TOKEN = os.environ.get("HF_TOKEN")

# Nomic vision model
processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)

# Nomic text model
text_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True, token=HF_TOKEN)

def image_url_to_pil(url: str, max_size=(512, 512)) -> Image.Image:
    """
    Download an image and return it as a resized RGB PIL image.
    Ex usage: image_blobs = df["image_url"].apply(image_url_to_pil).tolist()
    """
    response = requests.get(url, stream=True, timeout=5)
    response.raise_for_status()
    image = Image.open(io.BytesIO(response.content)).convert("RGB")
    image.thumbnail(max_size, Image.Resampling.LANCZOS)
    return image

def generate_img_embed(images_urls, batch_size=20):
    """Generate image embeddings in batches to manage memory usage.

    Args:
        images_urls (list): List of image URLs
        batch_size (int): Number of images to process at once
    """
    all_embeddings = []

    for i in range(0, len(images_urls), batch_size):
        batch_urls = images_urls[i:i + batch_size]
        images = [image_url_to_pil(image_url) for image_url in batch_urls]
        inputs = processor(images, return_tensors="pt")
        with torch.no_grad():  # inference only, no gradients needed
            img_emb = vision_model(**inputs).last_hidden_state
        # CLS-token embedding, L2-normalized
        img_embeddings = F.normalize(img_emb[:, 0], p=2, dim=1)
        all_embeddings.append(img_embeddings.detach().numpy())

    return np.vstack(all_embeddings)

def generate_text_embed(sentences: List, batch_size=64):
    """Generate text embeddings in batches to manage memory usage.

    Args:
        sentences (List): List of text strings to encode
        batch_size (int): Number of sentences to process at once
    """
    all_embeddings = []

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        embeddings = text_model.encode(batch_sentences)
        all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

def add_concatenated_text_field_exclude_keys(item_dict, keys_to_exclude=None, text_field_key="text", pair_separator=" - "):
    """Concatenate all non-excluded, non-empty key/value pairs into a single
    'search_document: ...' text field (the prefix the Nomic text model expects)."""
    if not isinstance(item_dict, dict):
        raise TypeError("Input must be a dictionary.")
    if keys_to_exclude is None:
        keys_to_exclude = set()  # Default to empty set
    else:
        keys_to_exclude = set(keys_to_exclude)  # Ensure it's a set for efficient lookup

    # Add the target text key to the exclusion set automatically
    keys_to_exclude.add(text_field_key)

    formatted_pairs = []
    for key, value in item_dict.items():
        # 1. Skip any key in the exclusion set
        if key in keys_to_exclude:
            continue

        # 2. Check for empty/invalid values
        is_empty_or_invalid = False
        if value is None:
            is_empty_or_invalid = True
        elif isinstance(value, float) and math.isnan(value):
            is_empty_or_invalid = True
        elif isinstance(value, (str, list, tuple, dict)) and len(value) == 0:
            is_empty_or_invalid = True

        # 3. Format and add if valid
        if not is_empty_or_invalid:
            formatted_pairs.append(f"{str(key)}: {str(value)}")

    concatenated_text = f"search_document: {pair_separator.join(formatted_pairs)}"
    item_dict[text_field_key] = concatenated_text
    return item_dict

def prepare_df_atlas(df: pd.DataFrame, id_col='id', images_col='images_urls'):
    # Drop completely empty columns
    #df = df.dropna(axis=1, how='all')

    # Fill remaining nulls with empty strings
    #df = df.fillna('')

    # Ensure ID column exists
    if id_col not in df.columns:
        df[id_col] = [f'{i}' for i in range(len(df))]

    # Ensure indexed field exists and is not empty
    #if indexed_col not in df.columns:
    #    df[indexed_col] = ''

    # Keep only the first image URL per item
    #df[images_col] = df[images_col].apply(lambda x: [x[0]] if isinstance(x, list) and len(x) > 1 else x if isinstance(x, list) else [x])
    df[images_col] = df[images_col].apply(lambda x: x[0] if isinstance(x, list) else x)

    # Optional: force all to string (can help with weird dtypes)
    for col in df.columns:
        df[col] = df[col].astype(str)

    return df

def remove_key_value_from_dicts(list_of_dict, key_to_remove):
    """Return copies of the dictionaries with key_to_remove deleted."""
    new_list = []
    for dictionary in list_of_dict:
        new_dict = dictionary.copy()  # Copy to avoid modifying the original list
        if key_to_remove in new_dict:
            del new_dict[key_to_remove]
        new_list.append(new_dict)
    return new_list

def remove_key_value_from_dict(input_dict, key_to_remove='text'):
    """Delete key_to_remove from a single dictionary in place."""
    if not isinstance(input_dict, dict):
        raise TypeError("Input must be a dictionary.")

    if key_to_remove in input_dict:
        del input_dict[key_to_remove]

    return input_dict
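A minimal sketch of the intended flow through these helpers, from a parsed Omeka record to the combined text+image vector the app stores (the record values and URL are made up; both Nomic models emit 768-dim vectors, which is why the app concatenates to 1536, and importing helpers loads both models):

import numpy as np
import helpers  # loads both Nomic models at import time

record = {
    "id": "42",
    "Title": "Carte postale",
    "Description": "Vue du port",
    "images_urls": ["https://example.org/files/original/42.jpg"],  # placeholder URL
}

# Build the 'text' field from every metadata pair except id/images_urls;
# the helper prefixes it with 'search_document:' as the Nomic model expects.
record = helpers.add_concatenated_text_field_exclude_keys(
    record, keys_to_exclude=["id", "images_urls"], text_field_key="text"
)
print(record["text"])  # search_document: Title: Carte postale - Description: Vue du port

text_vec = helpers.generate_text_embed([record["text"]])  # shape (1, 768)
# With a resolvable image URL: img_vec = helpers.generate_img_embed(record["images_urls"])
img_vec = np.zeros((1, 768), dtype=np.float32)            # placeholder for this sketch
combined = np.concatenate([text_vec, img_vec], axis=1)    # (1, 1536), as stored in LanceDB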
lancedb_client.py
ADDED

import numpy as np
import lancedb
import pyarrow as pa
import logging
from dotenv import load_dotenv
import os
import ast

# Load env vars
load_dotenv(os.path.join(os.getcwd(), ".env"), override=True)
# _DEFAULT_PARSE_METADATA is stored as a Python tuple literal, so parse it safely
metadata_keys = list(ast.literal_eval(os.getenv("_DEFAULT_PARSE_METADATA", "()")))

# Setup logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LanceDBManager:

    def __init__(self, db_uri="lancedb", embedding_dim=1536):
        self.db = lancedb.connect(db_uri)
        self.embedding_dim = embedding_dim
        logger.info(f"Connected to LanceDB at {db_uri}")

    def _build_schema(self):
        """Build LanceDB schema with dynamic metadata fields and embedding vector."""
        fields = [
            pa.field("id", pa.int64()),
            pa.field("item_id", pa.string()),
            pa.field("images_urls", pa.string()),
            pa.field("text", pa.string()),
            pa.field("Cluster", pa.string()),
            pa.field("Topic", pa.string()),
            pa.field("embeddings", pa.list_(pa.float32(), self.embedding_dim)),
            pa.field("umap_embeddings", pa.list_(pa.float32(), 2)),
        ]

        # Add fields from metadata
        for key in metadata_keys:
            sanitized_key = key.split(":")[1].strip().capitalize()  # drop the vocabulary prefix and capitalize
            fields.append(pa.field(sanitized_key, pa.string()))

        return pa.schema(fields)

    def create_table(self, table_name):
        """Create table using dynamic schema."""
        try:
            schema = self._build_schema()
            table = self.db.create_table(table_name, schema=schema)
            logger.info(f"Created LanceDB table '{table_name}'")
            return table
        except Exception as e:
            logger.error(f"Failed to create table '{table_name}': {e}")
            raise

    def retrieve_table(self, table_name):
        try:
            table = self.db.open_table(table_name)
            logger.info(f"Opened existing LanceDB table '{table_name}'")
            return table
        except Exception as e:
            logger.error(f"Failed to open table '{table_name}': {e}")
            raise

    def initialize_table(self, table_name):
        try:
            return self.retrieve_table(table_name)
        except Exception:
            logger.info(f"Table '{table_name}' not found. Creating new.")
            return self.create_table(table_name)

    def add_entry(self, table_name, items):
        table = self.initialize_table(table_name)
        table.add(items)
        logger.info(f"Added items to table '{table_name}'")

    def list_tables(self):
        """List all existing tables in the LanceDB instance."""
        try:
            tables = self.db.table_names()
            logger.info("Retrieved list of tables.")
            return tables
        except Exception as e:
            logger.error(f"Failed to list tables: {e}")
            raise

    def get_content_table(self, table_name):
        table = self.initialize_table(table_name)
        return table.to_pandas()

    def drop_table(self, table_name):
        """Remove an existing table by name."""
        try:
            table = self.db.drop_table(table_name)
            logger.info(f"Removed LanceDB table '{table_name}' successfully.")
            return table
        except Exception as e:
            logger.error(f"Failed to remove existing table '{table_name}': {e}")
            raise
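For reference, a minimal sketch of driving LanceDBManager end to end, using only the methods above (the table name and record values are illustrative; vectors must match the declared dimensions, and every schema column should be present in the record):

from lancedb_client import LanceDBManager, metadata_keys

record = {
    "id": 1,
    "item_id": "item-1",
    "images_urls": "https://example.org/img.jpg",
    "text": "search_document: Title: Demo",
    "Cluster": "0",
    "Topic": "demo, test",
    "embeddings": [0.0] * 1536,
    "umap_embeddings": [0.0, 0.0],
}
# Fill the per-metadata string columns the schema declares (Identifier, Type, ...)
for key in metadata_keys:
    record[key.split(":")[1].capitalize()] = ""

manager = LanceDBManager()           # connects to ./lancedb, 1536-dim embeddings
manager.add_entry("demo", [record])  # creates the table on first use
print(manager.list_tables())
df = manager.get_content_table("demo")  # whole table as a pandas DataFrame
manager.drop_table("demo")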
requirements.txt
ADDED

pandas
numpy
dash
dash-bootstrap-components
plotly
datasets
pillow
transformers
sentence-transformers
einops
lancedb
requests
umap-learn
scikit-learn
hdbscan
git+https://github.com/gegedenice/omeka-s-api-client.git
python-dotenv
gunicorn