Geraldine committed on
Commit 97226b8 · verified · 1 Parent(s): 2d31bce

Upload 6 files

Files changed (6)
  1. .env +2 -0
  2. Dockerfile +12 -0
  3. app.py +286 -0
  4. helpers.py +146 -0
  5. lancedb_client.py +100 -0
  6. requirements.txt +18 -0
.env ADDED
@@ -0,0 +1,2 @@
+ _DEFAULT_PARSE_METADATA=('dcterms:identifier','dcterms:type','dcterms:title', 'dcterms:description','dcterms:creator','dcterms:publisher','dcterms:date','dcterms:spatial','dcterms:format','dcterms:provenance','dcterms:subject','dcterms:medium','bibo:annotates','bibo:content', 'bibo:locator', 'bibo:owner')
+ HF_TOKEN=""
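Note: the _DEFAULT_PARSE_METADATA value is stored as a Python tuple literal, so it has to be parsed back into a list of keys at load time. lancedb_client.py below does this by splitting on commas and stripping the leftover punctuation; since that file already imports ast, here is a sketch of an equivalent (and arguably more robust) parse with ast.literal_eval:

import ast
import os
from dotenv import load_dotenv

load_dotenv()
# literal_eval turns the stored "('a','b',...)" string back into a real tuple
raw = os.getenv("_DEFAULT_PARSE_METADATA", "()")
metadata_keys = list(ast.literal_eval(raw))
print(metadata_keys[:3])  # ['dcterms:identifier', 'dcterms:type', 'dcterms:title']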
Dockerfile ADDED
@@ -0,0 +1,12 @@
+ # syntax=docker/dockerfile:1
+ FROM python:3.11-slim
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+ WORKDIR /app
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --upgrade pip
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ COPY --chown=user . /app
+ EXPOSE 8050
+ CMD gunicorn --workers 5 --threads 2 -b 0.0.0.0:8050 --timeout 0 app:server
app.py ADDED
@@ -0,0 +1,286 @@
+ import dash
+ from dash import dcc, html, Input, Output, State, ctx
+ import dash_bootstrap_components as dbc
+ import plotly.express as px
+ import pandas as pd
+ import numpy as np
+ import umap
+ import hdbscan
+ import sklearn.feature_extraction.text as text
+ from dash.exceptions import PreventUpdate
+ import os
+ from dotenv import load_dotenv
+ import helpers
+ import lancedb
+ from omeka_s_api_client import OmekaSClient, OmekaSClientError
+ from lancedb_client import LanceDBManager
+
+ # Load .env for credentials
+ load_dotenv()
+ _DEFAULT_PARSE_METADATA = (
+     'dcterms:identifier','dcterms:type','dcterms:title','dcterms:description',
+     'dcterms:creator','dcterms:publisher','dcterms:date','dcterms:spatial',
+     'dcterms:format','dcterms:provenance','dcterms:subject','dcterms:medium',
+     'bibo:annotates','bibo:content','bibo:locator','bibo:owner'
+ )
+
+ app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
+ app.config.suppress_callback_exceptions = True
+ server = app.server
+ manager = LanceDBManager()
+
+ french_stopwords = text.ENGLISH_STOP_WORDS.union([
+     "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon",
+     "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans",
+     "des", "du", "dedans", "dehors", "depuis", "devrait", "doit", "donc", "dos",
+     "début", "elle", "elles", "en", "encore", "essai", "est", "et", "eu", "fait",
+     "faites", "fois", "font", "hors", "ici", "il", "ils", "je", "juste", "la", "le",
+     "les", "leur", "là", "ma", "maintenant", "mais", "mes", "mine", "moins", "mon",
+     "mot", "même", "ni", "nommés", "notre", "nous", "nouveaux", "ou", "où", "par",
+     "parce", "parole", "pas", "personnes", "peut", "peu", "pièce", "plupart", "pour",
+     "pourquoi", "quand", "que", "quel", "quelle", "quelles", "quels", "qui", "sa",
+     "sans", "ses", "seulement", "si", "sien", "son", "sont", "sous", "soyez", "sujet",
+     "sur", "ta", "tandis", "tellement", "tels", "tes", "ton", "tous", "tout", "trop",
+     "très", "tu", "valeur", "voie", "voient", "vont", "votre", "vous", "vu", "ça",
+     "étaient", "état", "étions", "été", "être"
+ ])
+
+ # -------------------- Layout --------------------
+ app.layout = dbc.Container([
+     html.H2("🌍 Omeka S UMAP Explorer", className="text-center mt-4"),
+     html.Hr(),
+
+     # Input controls
+     dbc.Row([
+         dbc.Col([
+             html.H5("🔍 From Omeka S"),
+             dcc.Input(id="api-url", value="https://your-omeka-instance.org", type="text", className="form-control"),
+             dbc.Button("Load Item Sets", id="load-sets", color="secondary", className="mt-2"),
+             dcc.Dropdown(id="items-sets-dropdown", placeholder="Select a collection"),
+             dcc.Input(id="table-name", value="my_table", type="text", className="form-control mt-2", placeholder="New table name"),
+             dbc.Button("Process Omeka Collection", id="load-data", color="primary", className="mt-2"),
+         ], md=4),
+
+         dbc.Col([
+             html.H5("📁 From LanceDB"),
+             dbc.Button("Load Existing Tables", id="load-tables", color="info"),
+             dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
+             dbc.Button("Display Table", id="load-data-db", color="success", className="mt-2"),
+         ], md=4),
+
+         dbc.Col([
+             html.H5("🔎 Query Tool (coming soon)"),
+             dbc.Input(placeholder="Type a search query...", type="text", disabled=True),
+         ], md=4),
+     ], className="mb-4"),
+
+     # Main plot area and metadata side panel
+     dbc.Row([
+         dbc.Col(
+             dcc.Graph(id="umap-graph", style={"height": "700px"}),
+             md=8
+         ),
+         dbc.Col(
+             html.Div(id="point-details", style={
+                 "padding": "15px",
+                 "borderLeft": "1px solid #ccc",
+                 "height": "700px",
+                 "overflowY": "auto"
+             }),
+             md=4
+         ),
+     ]),
+
+     # Status/info
+     html.Div(id="status", className="mt-3"),
+
+     dcc.Store(id="omeka-client-config", storage_type="session")
+ ], fluid=True)
+
+ # -------------------- Callbacks --------------------
+
+ @app.callback(
+     Output("items-sets-dropdown", "options"),
+     Output("omeka-client-config", "data"),
+     Input("load-sets", "n_clicks"),
+     State("api-url", "value"),
+     prevent_initial_call=True
+ )
+ def load_item_sets(n, base_url):
+     client = OmekaSClient(base_url, "...", "...", 50)
+     try:
+         item_sets = client.list_all_item_sets()
+         options = [{"label": s.get('dcterms:title', [{}])[0].get('@value', 'N/A'), "value": s["o:id"]} for s in item_sets]
+         return options, {
+             "base_url": base_url,
+             "key_identity": "...",
+             "key_credential": "...",
+             "default_per_page": 50
+         }
+     except Exception as e:
+         return dash.no_update, dash.no_update
+
+ @app.callback(
+     Output("db-tables-dropdown", "options"),
+     Input("load-tables", "n_clicks"),
+     prevent_initial_call=True
+ )
+ def list_tables(n):
+     return [{"label": t, "value": t} for t in manager.list_tables()]
+
+ @app.callback(
+     Output("umap-graph", "figure"),
+     Output("status", "children"),
+     Input("load-data", "n_clicks"),      # From Omeka S
+     Input("load-data-db", "n_clicks"),   # From DB table
+     State("items-sets-dropdown", "value"),
+     State("omeka-client-config", "data"),
+     State("table-name", "value"),
+     State("db-tables-dropdown", "value"),
+     prevent_initial_call=True
+ )
+ def handle_data_loading(n_clicks_omeka, n_clicks_db, item_set_id, client_config, table_name, db_table):
+     triggered_id = ctx.triggered_id
+     print(triggered_id)
+
+     if triggered_id == "load-data":  # Omeka S case
+         if not client_config:
+             raise PreventUpdate
+
+         client = OmekaSClient(
+             base_url=client_config["base_url"],
+             key_identity=client_config["key_identity"],
+             key_credential=client_config["key_credential"]
+         )
+
+         df_omeka = harvest_omeka_items(client, item_set_id=item_set_id)
+         items = df_omeka.to_dict(orient="records")
+         records_with_text = [helpers.add_concatenated_text_field_exclude_keys(item, keys_to_exclude=['id', 'images_urls'], text_field_key='text', pair_separator=' - ') for item in items]
+         df = helpers.prepare_df_atlas(pd.DataFrame(records_with_text), id_col='id', images_col='images_urls')
+
+         text_embed = helpers.generate_text_embed(df['text'].tolist())
+         img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
+         embeddings = np.concatenate([text_embed, img_embed], axis=1)
+         df["embeddings"] = embeddings.tolist()
+
+         reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
+         umap_embeddings = reducer.fit_transform(embeddings)
+         df["umap_embeddings"] = umap_embeddings.tolist()
+
+         clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
+         cluster_labels = clusterer.fit_predict(umap_embeddings)
+         df["Cluster"] = cluster_labels
+
+         vectorizer = text.TfidfVectorizer(max_features=1000, stop_words=list(french_stopwords), lowercase=True)
+         tfidf_matrix = vectorizer.fit_transform(df["text"].astype(str).tolist())
+         top_words = []
+         for label in sorted(df["Cluster"].unique()):
+             if label == -1:
+                 top_words.append("Noise")
+                 continue
+             mask = (df["Cluster"] == label).to_numpy().nonzero()[0]
+             cluster_docs = tfidf_matrix[mask]
+             mean_tfidf = cluster_docs.mean(axis=0)
+             mean_tfidf = np.asarray(mean_tfidf).flatten()
+             top_indices = mean_tfidf.argsort()[::-1][:5]
+             terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
+             top_words.append(", ".join(terms))
+         cluster_name_map = {label: name for label, name in zip(sorted(df["Cluster"].unique()), top_words)}
+         df["Topic"] = df["Cluster"].map(cluster_name_map)
+
+         df["Cluster"] = df["Cluster"].astype(str)  # cast so records match the string "Cluster" field in the LanceDB schema
+         manager.initialize_table(table_name)
+         manager.add_entry(table_name, df.to_dict(orient="records"))
+
+     elif triggered_id == "load-data-db":  # Load existing LanceDB table
+         if not db_table:
+             raise PreventUpdate
+         items = manager.get_content_table(db_table)
+         df = pd.DataFrame(items)
+         df = df.dropna(axis=1, how='all')
+         df = df.fillna('')
+         #umap_embeddings = np.array(df["umap_embeddings"].tolist())
+
+     else:
+         raise PreventUpdate
+
+     # Plotting
+     return create_umap_plot(df)
+
+
+ @app.callback(
+     Output("point-details", "children"),
+     Input("umap-graph", "clickData")
+ )
+ def show_point_details(clickData):
+     if not clickData:
+         return html.Div("🖱️ Click a point to see more details.", style={"color": "#888"})
+     img_url, title, desc = clickData["points"][0]["customdata"]
+     return html.Div([
+         html.H4(title),
+         html.Img(src=img_url, style={"maxWidth": "100%", "marginBottom": "10px"}),
+         html.P(desc or "No description available.")
+     ])
+
+ # -------------------- Utility --------------------
+
+ def harvest_omeka_items(client, item_set_id=None, per_page=50):
+     """
+     Fetch and parse items from Omeka S.
+     Args:
+         client: OmekaSClient instance
+         item_set_id: ID of the item set to fetch items from (optional)
+         per_page: Number of items to fetch per page (default: 50)
+     Returns:
+         DataFrame containing parsed item data
+     """
+     print("\n--- Fetching and Parsing Multiple Items by collection ---")
+     try:
+         # Fetch all items of the collection, page by page
+         items_list = client.list_all_items(item_set_id=item_set_id, per_page=per_page)
+         print(f"Fetched {len(items_list)} items.")
+
+         parsed_items_list = []
+         for item_raw in items_list:
+             if 'o:media' in item_raw:
+                 parsed = client.digest_item_data(item_raw, prefixes=_DEFAULT_PARSE_METADATA)
+                 if parsed:  # Only add if parsing was successful
+                     # Add media
+                     medias_id = [x["o:id"] for x in item_raw["o:media"]]
+                     medias_list = []
+                     for media_id in medias_id:
+                         media = client.get_media(media_id)
+                         if "image" in media["o:media_type"]:
+                             medias_list.append(media.get('o:original_url'))
+                     if medias_list:  # Only keep items that have image URLs
+                         parsed["images_urls"] = medias_list
+                         parsed_items_list.append(parsed)
+
+         print(f"Successfully parsed {len(parsed_items_list)} items.")
+         # Note: List columns (like dcterms:title) might need further handling in Pandas
+         return pd.DataFrame(parsed_items_list)
+     except OmekaSClientError as e:
+         print(f"Error fetching/parsing multiple items: {e}")
+     except Exception as e:
+         print(f"An unexpected error occurred during multi-item parsing: {e}")
+     return pd.DataFrame()  # empty frame on failure so callers never receive None
+
+ def create_umap_plot(df):
+     coords = np.array(df["umap_embeddings"].tolist())
+     fig = px.scatter(
+         df, x=coords[:, 0], y=coords[:, 1],
+         color="Topic",
+         custom_data=["images_urls", "Title", "Description"],
+         hover_data=None,
+         title="UMAP Projection with HDBSCAN Topics"
+     )
+     fig.update_traces(
+         marker=dict(size=8, line=dict(width=1, color="DarkSlateGrey")),
+         hovertemplate="<b>%{customdata[1]}</b><br><img src='%{customdata[0]}' height='150'><extra></extra>"
+     )
+     fig.update_layout(height=700, margin=dict(t=30, b=30, l=30, r=30))
+     return fig, f"Loaded {len(df)} items and projected into 2D."
+
+ if __name__ == "__main__":
+     app.run(debug=True)
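The topic labels built in handle_data_loading come from averaging TF-IDF weights over the documents of each HDBSCAN cluster and keeping the five highest-scoring terms (noise points, labeled -1, are named "Noise"). A self-contained sketch of that step on toy data, with made-up documents and labels standing in for real Omeka records:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["old map of paris", "map of the city of paris", "studio portrait photograph", "portrait of a woman"]
labels = np.array([0, 0, 1, 1])  # stand-in for hdbscan's fit_predict output; -1 would mean noise

vectorizer = TfidfVectorizer(max_features=1000, lowercase=True)
tfidf_matrix = vectorizer.fit_transform(docs)

for label in sorted(set(labels)):
    if label == -1:
        print(label, "Noise")
        continue
    mask = (labels == label).nonzero()[0]          # row indices of this cluster's documents
    mean_tfidf = np.asarray(tfidf_matrix[mask].mean(axis=0)).flatten()
    top_indices = mean_tfidf.argsort()[::-1][:5]   # five highest-weighted terms
    print(label, ", ".join(vectorizer.get_feature_names_out()[i] for i in top_indices))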
helpers.py ADDED
@@ -0,0 +1,146 @@
+ from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
+ from sentence_transformers import SentenceTransformer
+ import torch
+ import torch.nn.functional as F
+ from PIL import Image
+ import requests
+ import os
+ import json
+ import math
+ import re
+ import pandas as pd
+ import numpy as np
+ from omeka_s_api_client import OmekaSClient, OmekaSClientError
+ from typing import List, Dict, Any, Union
+ import io
+ from dotenv import load_dotenv
+
+ # env var
+ load_dotenv(os.path.join(os.getcwd(), ".env"))
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ # Nomic vision model
+ processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
+ vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
+
+ # Nomic text model
+ text_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True, token=HF_TOKEN)
+
+ def image_url_to_pil(url: str, max_size=(512, 512)) -> Image.Image:
+     """
+     Ex usage: image_blobs = df["image_url"].apply(image_url_to_pil).tolist()
+     """
+     response = requests.get(url, stream=True, timeout=5)
+     response.raise_for_status()
+     image = Image.open(io.BytesIO(response.content)).convert("RGB")
+     image.thumbnail(max_size, Image.Resampling.LANCZOS)
+     return image
+
+ def generate_img_embed(images_urls, batch_size=20):
+     """Generate image embeddings in batches to manage memory usage.
+
+     Args:
+         images_urls (list): List of image URLs
+         batch_size (int): Number of images to process at once
+     """
+     all_embeddings = []
+
+     for i in range(0, len(images_urls), batch_size):
+         batch_urls = images_urls[i:i + batch_size]
+         images = [image_url_to_pil(image_url) for image_url in batch_urls]
+         inputs = processor(images, return_tensors="pt")
+         img_emb = vision_model(**inputs).last_hidden_state
+         img_embeddings = F.normalize(img_emb[:, 0], p=2, dim=1)
+         all_embeddings.append(img_embeddings.detach().numpy())
+
+     return np.vstack(all_embeddings)
+
+ def generate_text_embed(sentences: List, batch_size=64):
+     """Generate text embeddings in batches to manage memory usage.
+
+     Args:
+         sentences (List): List of text strings to encode
+         batch_size (int): Number of sentences to process at once
+     """
+     all_embeddings = []
+
+     for i in range(0, len(sentences), batch_size):
+         batch_sentences = sentences[i:i + batch_size]
+         embeddings = text_model.encode(batch_sentences)
+         all_embeddings.append(embeddings)
+
+     return np.vstack(all_embeddings)
+
+ def add_concatenated_text_field_exclude_keys(item_dict, keys_to_exclude=None, text_field_key="text", pair_separator=" - "):
+     if not isinstance(item_dict, dict):
+         raise TypeError("Input must be a dictionary.")
+     if keys_to_exclude is None:
+         keys_to_exclude = set()  # Default to empty set
+     else:
+         keys_to_exclude = set(keys_to_exclude)  # Ensure it's a set for efficient lookup
+
+     # Add the target text key to the exclusion set automatically
+     keys_to_exclude.add(text_field_key)
+
+     formatted_pairs = []
+     for key, value in item_dict.items():
+         # 1. Skip any key in the exclusion set
+         if key in keys_to_exclude:
+             continue
+
+         # 2. Check for empty/invalid values
+         is_empty_or_invalid = False
+         if value is None: is_empty_or_invalid = True
+         elif isinstance(value, float) and math.isnan(value): is_empty_or_invalid = True
+         elif isinstance(value, (str, list, tuple, dict)) and len(value) == 0: is_empty_or_invalid = True
+
+         # 3. Format and add if valid
+         if not is_empty_or_invalid:
+             formatted_pairs.append(f"{str(key)}: {str(value)}")
+
+     concatenated_text = f"search_document: {pair_separator.join(formatted_pairs)}"
+     item_dict[text_field_key] = concatenated_text
+     return item_dict
+
+ def prepare_df_atlas(df: pd.DataFrame, id_col='id', images_col='images_urls'):
+     # Drop completely empty columns
+     #df = df.dropna(axis=1, how='all')
+
+     # Fill remaining nulls with empty strings
+     #df = df.fillna('')
+
+     # Ensure ID column exists
+     if id_col not in df.columns:
+         df[id_col] = [f'{i}' for i in range(len(df))]
+
+     # Keep a single representative image URL per item
+     df[images_col] = df[images_col].apply(lambda x: x[0] if isinstance(x, list) else x)
+
+     # Optional: force all to string (can help with weird dtypes)
+     for col in df.columns:
+         df[col] = df[col].astype(str)
+
+     return df
+
+ def remove_key_value_from_dicts(list_of_dict, key_to_remove):
+     """Remove a key from every dict in a list (renamed so it is not shadowed by the single-dict helper below)."""
+     new_list = []
+     for dictionary in list_of_dict:
+         new_dict = dictionary.copy()  # Create a copy to avoid modifying the original list
+         if key_to_remove in new_dict:
+             del new_dict[key_to_remove]
+         new_list.append(new_dict)
+     return new_list
+
+ def remove_key_value_from_dict(input_dict, key_to_remove='text'):
+     if not isinstance(input_dict, dict):
+         raise TypeError("Input must be a dictionary.")
+
+     if key_to_remove in input_dict:
+         del input_dict[key_to_remove]
+
+     return input_dict
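helpers.add_concatenated_text_field_exclude_keys is what produces the "search_document: ..." strings fed to the text embedder; the prefix matters because the Nomic text models expect a task prefix such as search_document or search_query on every input. A quick illustration on a toy record (the field names are made up, and note that importing helpers loads both embedding models):

from helpers import add_concatenated_text_field_exclude_keys

record = {"id": "42", "Title": "Carte de Paris", "Date": "1850", "images_urls": ["http://example.org/img.jpg"]}
add_concatenated_text_field_exclude_keys(record, keys_to_exclude=['id', 'images_urls'], text_field_key='text', pair_separator=' - ')
print(record["text"])
# search_document: Title: Carte de Paris - Date: 1850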
lancedb_client.py ADDED
@@ -0,0 +1,100 @@
+ import numpy as np
+ import lancedb
+ import pyarrow as pa
+ import logging
+ from dotenv import load_dotenv
+ import os
+ import ast
+
+ # Load env vars
+ load_dotenv(os.path.join(os.getcwd(), ".env"), override=True)
+ metadata_keys_raw = os.getenv("_DEFAULT_PARSE_METADATA", "").split(",")
+ # Strip the parentheses, quotes and spaces left over from the tuple literal
+ metadata_keys = [key.strip(" ()'\"") for key in metadata_keys_raw]
+
+ # Setup logger
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ class LanceDBManager:
+
+     def __init__(self, db_uri="lancedb", embedding_dim=1536):
+         self.db = lancedb.connect(db_uri)
+         self.embedding_dim = embedding_dim
+         logger.info(f"Connected to LanceDB at {db_uri}")
+
+     def _build_schema(self):
+         """Build LanceDB schema with dynamic metadata fields and embedding vector."""
+         fields = [
+             pa.field("id", pa.int64()),
+             pa.field("item_id", pa.string()),
+             pa.field("images_urls", pa.string()),
+             pa.field("text", pa.string()),
+             pa.field("Cluster", pa.string()),
+             pa.field("Topic", pa.string()),
+             pa.field("embeddings", pa.list_(pa.float32(), self.embedding_dim)),
+             pa.field("umap_embeddings", pa.list_(pa.float32(), 2)),
+         ]
+
+         # Add fields from metadata
+         for key in metadata_keys:
+             sanitized_key = key.split(":")[1].strip().capitalize()  # drop the vocabulary prefix and capitalize
+             fields.append(pa.field(sanitized_key, pa.string()))
+
+         return pa.schema(fields)
+
+     def create_table(self, table_name):
+         """Create table using dynamic schema."""
+         try:
+             schema = self._build_schema()
+             table = self.db.create_table(table_name, schema=schema)
+             logger.info(f"Created LanceDB table '{table_name}'")
+             return table
+         except Exception as e:
+             logger.error(f"Failed to create table '{table_name}': {e}")
+             raise
+
+     def retrieve_table(self, table_name):
+         try:
+             table = self.db.open_table(table_name)
+             logger.info(f"Opened existing LanceDB table '{table_name}'")
+             return table
+         except Exception as e:
+             logger.error(f"Failed to open table '{table_name}': {e}")
+             raise
+
+     def initialize_table(self, table_name):
+         try:
+             return self.retrieve_table(table_name)
+         except Exception:
+             logger.info(f"Table '{table_name}' not found. Creating new.")
+             return self.create_table(table_name)
+
+     def add_entry(self, table_name, items):
+         table = self.initialize_table(table_name)
+         table.add(items)
+         logger.info(f"Added items to table '{table_name}'")
+
+     def list_tables(self):
+         """List all existing tables in the LanceDB instance."""
+         try:
+             tables = self.db.table_names()
+             logger.info("Retrieved list of tables.")
+             return tables
+         except Exception as e:
+             logger.error(f"Failed to list tables: {e}")
+             raise
+
+     def get_content_table(self, table_name):
+         table = self.initialize_table(table_name)
+         return table.to_pandas()
+
+     def drop_table(self, table_name):
+         """Remove an existing table by name."""
+         try:
+             table = self.db.drop_table(table_name)
+             logger.info(f"Removed existing LanceDB table '{table_name}' successfully.")
+             return table
+         except Exception as e:
+             logger.error(f"Failed to remove existing table '{table_name}': {e}")
+             raise
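A minimal usage sketch of LanceDBManager (the table name is illustrative; add_entry expects records carrying every field of the schema built by _build_schema, which is what app.py assembles before calling it):

from lancedb_client import LanceDBManager

manager = LanceDBManager(db_uri="lancedb", embedding_dim=1536)  # 768 text dims + 768 image dims, concatenated
print(manager.list_tables())                  # tables currently stored under ./lancedb

table = manager.initialize_table("my_table")  # opens the table, creating it on first use
df = manager.get_content_table("my_table")    # full contents as a pandas DataFrame
manager.drop_table("my_table")                # remove it when done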
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ pandas
+ numpy
+ dash
+ dash-bootstrap-components
+ plotly
+ datasets
+ pillow
+ transformers
+ sentence-transformers
+ einops
+ lancedb
+ requests
+ umap-learn
+ scikit-learn
+ hdbscan
+ git+https://github.com/gegedenice/omeka-s-api-client.git
+ python-dotenv
+ gunicorn