Geraldine commited on
Commit
e6c6c2d
·
verified ·
1 Parent(s): 4ca86be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +618 -157
app.py CHANGED
@@ -1,17 +1,18 @@
1
  import dash
2
- from dash import dcc, html, Input, Output, State, ctx
 
3
  import dash_bootstrap_components as dbc
4
  import plotly.express as px
 
5
  import pandas as pd
6
  import numpy as np
7
  import umap
8
  import hdbscan
9
  import sklearn.feature_extraction.text as text
10
  from dash.exceptions import PreventUpdate
11
- import os
12
  from dotenv import load_dotenv
13
  import helpers
14
- import lancedb
15
  from omeka_s_api_client import OmekaSClient, OmekaSClientError
16
  from lancedb_client import LanceDBManager
17
 
@@ -24,11 +25,12 @@ _DEFAULT_PARSE_METADATA = (
24
  'bibo:annotates','bibo:content', 'bibo:locator', 'bibo:owner'
25
  )
26
 
27
- app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
28
  app.config.suppress_callback_exceptions = True
29
  server = app.server
30
  manager = LanceDBManager()
31
 
 
32
  french_stopwords = text.ENGLISH_STOP_WORDS.union([
33
  "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon",
34
  "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans",
@@ -46,58 +48,304 @@ french_stopwords = text.ENGLISH_STOP_WORDS.union([
46
  ])
47
 
48
  # -------------------- Layout --------------------
49
- app.layout = dbc.Container([
50
- html.H2("🌍 Omeka S UMAP Explorer", className="text-center mt-4"),
51
- html.Hr(),
52
-
53
- # Input controls
54
- dbc.Row([
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  dbc.Col([
56
- html.H5("🔍 From Omeka S"),
57
- dcc.Input(id="api-url", value="https://your-omeka-instance.org", type="text", className="form-control"),
58
- dbc.Button("Load Item Sets", id="load-sets", color="secondary", className="mt-2"),
59
- dcc.Dropdown(id="items-sets-dropdown", placeholder="Select a collection"),
60
- dcc.Input(id="table-name", value="my_table", type="text", className="form-control mt-2", placeholder="New table name"),
61
- dbc.Button("Process Omeka Collection", id="load-data", color="primary", className="mt-2"),
62
- ], md=4),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- dbc.Col([
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  html.H5("📁 From LanceDB"),
66
- dbc.Button("Load Existing Tables", id="load-tables", color="info"),
67
  dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
68
- dbc.Button("Display Table", id="load-data-db", color="success", className="mt-2"),
69
- ], md=4),
 
70
 
71
- dbc.Col([
72
- html.H5("🔎 Query Tool (coming soon)"),
73
- dbc.Input(placeholder="Type a search query...", type="text", disabled=True),
74
- ], md=4),
75
- ], className="mb-4"),
76
-
77
- # Main plot area and metadata side panel
78
- dbc.Row([
79
- dbc.Col(
80
- dcc.Graph(id="umap-graph", style={"height": "700px"}),
81
- md=8
82
- ),
83
- dbc.Col(
84
- html.Div(id="point-details", style={
85
- "padding": "15px",
86
- "borderLeft": "1px solid #ccc",
87
- "height": "700px",
88
- "overflowY": "auto"
89
- }),
90
- md=4
91
- ),
92
- ]),
93
 
94
- # Status/info
95
- html.Div(id="status", className="mt-3"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- dcc.Store(id="omeka-client-config", storage_type="session")
98
- ], fluid=True)
99
 
100
- # -------------------- Callbacks --------------------
101
 
102
  @app.callback(
103
  Output("items-sets-dropdown", "options"),
@@ -106,7 +354,9 @@ app.layout = dbc.Container([
106
  State("api-url", "value"),
107
  prevent_initial_call=True
108
  )
109
- def load_item_sets(n, base_url):
 
 
110
  client = OmekaSClient(base_url, "...", "...", 50)
111
  try:
112
  item_sets = client.list_all_item_sets()
@@ -120,108 +370,225 @@ def load_item_sets(n, base_url):
120
  except Exception as e:
121
  return dash.no_update, dash.no_update
122
 
 
123
  @app.callback(
124
- Output("db-tables-dropdown", "options"),
125
  Input("load-tables", "n_clicks"),
126
  prevent_initial_call=True
127
  )
128
- def list_tables(n):
129
- return [{"label": t, "value": t} for t in manager.list_tables()]
 
 
 
130
 
 
131
  @app.callback(
132
  Output("umap-graph", "figure"),
133
  Output("status", "children"),
134
- Input("load-data", "n_clicks"), # From Omeka S
135
- Input("load-data-db", "n_clicks"), # From DB table
136
  State("items-sets-dropdown", "value"),
137
  State("omeka-client-config", "data"),
138
  State("table-name", "value"),
139
- State("db-tables-dropdown", "value"),
140
  prevent_initial_call=True
141
  )
142
- def handle_data_loading(n_clicks_omeka, n_clicks_db, item_set_id, client_config, table_name, db_table):
143
- triggered_id = ctx.triggered_id
144
- print(triggered_id)
145
-
146
- if triggered_id == "load-data": # Omeka S case
147
- if not client_config:
148
- raise PreventUpdate
149
-
150
- client = OmekaSClient(
151
- base_url=client_config["base_url"],
152
- key_identity=client_config["key_identity"],
153
- key_credential=client_config["key_credential"]
154
- )
155
-
156
- df_omeka = harvest_omeka_items(client, item_set_id=item_set_id)
157
- items = df_omeka.to_dict(orient="records")
158
- records_with_text = [helpers.add_concatenated_text_field_exclude_keys(item, keys_to_exclude=['id','images_urls'], text_field_key='text', pair_separator=' - ') for item in items]
159
- df = helpers.prepare_df_atlas(pd.DataFrame(records_with_text), id_col='id', images_col='images_urls')
160
-
161
- text_embed = helpers.generate_text_embed(df['text'].tolist())
162
- img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
163
- embeddings = np.concatenate([text_embed, img_embed], axis=1)
164
- df["embeddings"] = embeddings.tolist()
165
-
166
- reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
167
- umap_embeddings = reducer.fit_transform(embeddings)
168
- df["umap_embeddings"] = umap_embeddings.tolist()
169
-
170
- clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
171
- cluster_labels = clusterer.fit_predict(umap_embeddings)
172
- df["Cluster"] = cluster_labels
173
-
174
- vectorizer = text.TfidfVectorizer(max_features=1000, stop_words=list(french_stopwords), lowercase=True)
175
- tfidf_matrix = vectorizer.fit_transform(df["text"].astype(str).tolist())
176
- top_words = []
177
- for label in sorted(df["Cluster"].unique()):
178
- if label == -1:
179
- top_words.append("Noise")
180
- continue
181
- mask = (df["Cluster"] == label).to_numpy().nonzero()[0]
182
- cluster_docs = tfidf_matrix[mask]
183
- mean_tfidf = cluster_docs.mean(axis=0)
184
- mean_tfidf = np.asarray(mean_tfidf).flatten()
185
- top_indices = mean_tfidf.argsort()[::-1][:5]
186
- terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
187
- top_words.append(", ".join(terms))
188
- cluster_name_map = {label: name for label, name in zip(sorted(df["Cluster"].unique()), top_words)}
189
- df["Topic"] = df["Cluster"].map(cluster_name_map)
190
-
191
- manager.initialize_table(table_name)
192
- manager.add_entry(table_name, df.to_dict(orient="records"))
193
-
194
- elif triggered_id == "load-data-db": # Load existing LanceDB table
195
- if not db_table:
196
- raise PreventUpdate
197
- items = manager.get_content_table(db_table)
198
- df = pd.DataFrame(items)
199
- df = df.dropna(axis=1, how='all')
200
- df = df.fillna('')
201
- #umap_embeddings = np.array(df["umap_embeddings"].tolist())
202
-
203
- else:
204
  raise PreventUpdate
205
 
206
- # Plotting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  return create_umap_plot(df)
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
 
210
  @app.callback(
211
  Output("point-details", "children"),
212
- Input("umap-graph", "clickData")
213
  )
214
- def show_point_details(clickData):
215
- if not clickData:
216
- return html.Div("🖱️ Click a point to see more details.", style={"color": "#888"})
217
- img_url, title, desc = clickData["points"][0]["customdata"]
218
  return html.Div([
219
- html.H4(title),
220
- html.Img(src=img_url, style={"maxWidth": "100%", "marginBottom": "10px"}),
221
- html.P(desc or "No description available.")
 
 
 
 
 
 
 
 
222
  ])
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  # -------------------- Utility --------------------
 
225
 
226
  def harvest_omeka_items(client, item_set_id=None, per_page=50):
227
  """
@@ -235,52 +602,146 @@ def harvest_omeka_items(client, item_set_id=None, per_page=50):
235
  """
236
  print("\n--- Fetching and Parsing Multiple Items by colection---")
237
  try:
238
- # Fetch first 5 items
239
  items_list = client.list_all_items(item_set_id=item_set_id, per_page=per_page)
240
- print(items_list)
241
- print(f"Fetched {len(items_list)} items.")
242
 
243
  parsed_items_list = []
244
- for item_raw in items_list:
245
- if 'o:media' in item_raw:
 
 
 
 
 
246
  parsed = client.digest_item_data(item_raw, prefixes=_DEFAULT_PARSE_METADATA)
247
- if parsed: # Only add if parsing was successful
248
- # Add media
249
- medias_id = [x["o:id"] for x in item_raw["o:media"]]
250
- medias_list = []
251
- for media_id in medias_id:
 
 
 
 
 
 
252
  media = client.get_media(media_id)
253
- if "image" in media["o:media_type"]:
254
- medias_list.append(media.get('o:original_url'))
255
- if medias_list: # Only append if there are image URLs
256
- parsed["images_urls"] = medias_list
257
- parsed_items_list.append(parsed)
258
- print(f"Successfully parsed {len(parsed_items_list)} items.")
259
-
260
- print(f"Successfully parsed {len(parsed_items_list)} items.")
261
- # Note: List columns (like dcterms:title) might need further handling in Pandas
262
- print("\nDataFrame from parsed items:")
263
- return pd.DataFrame(parsed_items_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  except OmekaSClientError as e:
265
- print(f"Error fetching/parsing multiple items: {e}")
 
266
  except Exception as e:
267
- print(f"An unexpected error occurred during multi-item parsing: {e}")
 
 
 
 
268
 
269
  def create_umap_plot(df):
270
  coords = np.array(df["umap_embeddings"].tolist())
271
  fig = px.scatter(
272
- df, x=coords[:, 0], y=coords[:, 1],
273
- color="Topic",
274
- custom_data=["images_urls", "Title", "Description"],
 
 
275
  hover_data=None,
276
- title="UMAP Projection with HDBSCAN Topics"
 
 
 
277
  )
 
278
  fig.update_traces(
279
- marker=dict(size=8, line=dict(width=1, color="DarkSlateGrey")),
280
- hovertemplate="<b>%{customdata[1]}</b><br><img src='%{customdata[0]}' height='150'><extra></extra>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  )
282
- fig.update_layout(height=700, margin=dict(t=30, b=30, l=30, r=30))
283
  return fig, f"Loaded {len(df)} items and projected into 2D."
284
 
285
  if __name__ == "__main__":
286
- app.run(debug=True, port=7860)
 
1
  import dash
2
+ from dash import dcc, html, Input, Output, State, ctx, callback_context
3
+ from dash.exceptions import PreventUpdate
4
  import dash_bootstrap_components as dbc
5
  import plotly.express as px
6
+ import plotly.graph_objects as go
7
  import pandas as pd
8
  import numpy as np
9
  import umap
10
  import hdbscan
11
  import sklearn.feature_extraction.text as text
12
  from dash.exceptions import PreventUpdate
13
+ import json
14
  from dotenv import load_dotenv
15
  import helpers
 
16
  from omeka_s_api_client import OmekaSClient, OmekaSClientError
17
  from lancedb_client import LanceDBManager
18
 
 
25
  'bibo:annotates','bibo:content', 'bibo:locator', 'bibo:owner'
26
  )
27
 
# Create the Dash application with the Bootstrap stylesheet. Callback
# exceptions are suppressed because several component ids (for example
# "api-url" and "load-sets") only exist after a tab has been rendered.
app = dash.Dash(__name__, suppress_callback_exceptions=True, external_stylesheets=[dbc.themes.BOOTSTRAP])
# NOTE(review): repeats the suppress_callback_exceptions=True constructor argument above.
app.config.suppress_callback_exceptions = True
# Underlying server object exposed at module level (presumably for the hosting platform; confirm).
server = app.server
# Shared LanceDB helper instance used by the callbacks in this module.
manager = LanceDBManager()
32
 
33
+
34
  french_stopwords = text.ENGLISH_STOP_WORDS.union([
35
  "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon",
36
  "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans",
 
48
  ])
49
 
50
  # -------------------- Layout --------------------
# Page layout: a navbar header, the main container (data-loading card,
# collapsible explanations, search controls, graph + details panel, status
# line and the session store), and a footer with credits and links.
app.layout = html.Div([
    # Header
    dbc.NavbarSimple(
        children=[],
        brand="Omeka S Computer Vision Asistant",
        brand_href="/",
        color="light",
        dark=False,
        className="mb-4 shadow-sm border-bottom"
    ),

    # Main Container
    dbc.Container(fluid=True, children=[
        dbc.Row([
            # Left column - Controls
            dbc.Col(width=6, children=[
                dbc.Card([
                    dbc.CardHeader(html.H4("Data Loading and ploting", className="text-center")),
                    dbc.CardBody([

                        # Tabs. The initial value must be the value of one of
                        # the dcc.Tab children; otherwise no tab is selected
                        # and the tab-content callback receives a value it
                        # does not recognise on first render.
                        dcc.Tabs(id="data-tabs", value="omeka", children=[
                            dcc.Tab(label="🔍 From Omeka S", value="omeka"),
                            dcc.Tab(label="📁 From LanceDB", value="lance")
                        ]),

                        # Filled by the tab-content callback.
                        html.Div(id="data-tab-content"),

                        html.Br(),
                    ])
                ], className="mb-4 shadow-sm")
            ]),
            # Right column - Explanations
            dbc.Col(width=6, children=[
                dbc.Card([
                    dbc.CardHeader(
                        html.H4(
                            dbc.Button("Explanations", color="primary", id="explanation-toggle", n_clicks=0),
                            className="text-center"
                        )
                    ),
                    dbc.Collapse(
                        dbc.CardBody([
                            html.P("This application allows you to explore Omeka S collections through interactive visualization."),
                            html.P("You can load data in two ways:"),
                            html.P("1. From Omeka S: Connect to your Omeka S instance and select a collection to visualize."),
                            html.P("2. From LanceDB: Load previously processed collections from the local database."),
                            html.P("The visualization uses UMAP projection and topic clustering to create an interactive map of your collection."),
                            html.P("You can explore items by hovering over points and search using semantic queries."),
                        ]),
                        id="explanation-collapse",
                        is_open=False
                    )
                ], className="mb-4 shadow-sm")
            ])
        ]),

        html.Br(),
        # Search bar: query input plus Search / Clear buttons
        dbc.Row([
            dbc.Col([
                dbc.InputGroup([
                    dbc.Input(
                        id="search-input",
                        type="text",
                        placeholder="Search...",
                    ),
                    dbc.Button(
                        "Search",
                        id="search-button",
                        color="primary",
                        size="sm",
                    ),
                    dbc.Button(
                        "Clear",
                        id="clear-button",
                        color="secondary",
                        size="sm",
                    ),
                ], className="d-flex align-items-center")
            ], width={"size": 6, "offset": 3}),  # Center the input group and make it half width
        ], className="mb-3"),
        # Slider controlling how many search results are requested
        dbc.Row([
            dbc.Col([
                html.Label("Number of results:", className="mb-0"),
                dcc.Slider(
                    id="search-limit-slider",
                    min=1,
                    max=50,
                    step=1,
                    value=5,
                    marks={i: str(i) for i in range(1, 51, 1)},
                    className="mt-1"
                ),
            ], width={"size": 6, "offset": 3}),
        ], className="mb-3"),
        html.Br(),
        # Central Visualization (like scatter plot, map etc.)
        dbc.Row([
            html.Div([
                dbc.Spinner(
                    id="loading-spinner",
                    type="grow",
                    color="primary",
                    fullscreen=False,
                    children=[
                        # Placeholder shown until a figure is available
                        html.Div(
                            id="graph-placeholder",
                            children="Select a data source and load data to visualize",
                            style={
                                "height": "700px",
                                "display": "flex",
                                "alignItems": "center",
                                "justifyContent": "center",
                                "color": "#666",
                                "fontSize": "1.2rem",
                                "fontStyle": "italic",
                                "width": "900px"  # Fixed pixel width
                            }
                        ),
                        # Graph starts hidden ("display": "none")
                        dcc.Graph(
                            id="umap-graph",
                            style={
                                "width": "900px",  # Fixed pixel width
                                "height": "700px",
                                "display": "none"
                            },
                            config={
                                'scrollZoom': True,
                                'displayModeBar': True,
                                'modeBarButtonsToAdd': ['drawline']
                            }
                        )],
                ),
                # Side panel filled with the details of the hovered point
                html.Div(id="point-details",
                         style={
                             "width": "30%",  # Set width to 30%
                             "padding": "15px",
                             "borderLeft": "1px solid #ccc",
                             "overflowY": "auto",
                             "height": "700px",
                             "minWidth": "250px",
                             "maxWidth": "30%"  # Match the width
                         }),
            ],
                style={
                    "display": "flex",
                    "flexDirection": "row",
                    "width": "100%",
                    "gap": "10px",
                    "justifyContent": "space-between"
                }),
        ]),
        html.Div(id="status"),
        dcc.Store(id="omeka-client-config", storage_type="session"),
    ]),

    # Footer
    html.Footer([
        html.Hr(),
        dbc.Container([
            dbc.Row([
                dbc.Col([
                    html.Img(src="SmartBibl.IA_Solutions.png", height="50"),
                    html.Small([
                        html.Br(),
                        html.A("Géraldine Geoffroy", href="mailto:grldn.geoffroy@gmail.com", className="text-muted")
                    ])
                ]),
                dbc.Col([
                    html.H5("Code source"),
                    html.Ul([
                        html.Li(html.A("Github", href="https://github.com/gegedenice/openalex-explorer", className="text-muted", target="_blank"))
                    ])
                ]),
                dbc.Col([
                    html.H5("Ressources"),
                    html.Ul([
                        html.Li(html.A("Nomic Atlas", href="https://atlas.nomic.ai/", target="_blank", className="text-muted")),
                        html.Li(html.A("Model nomic-embed-text-v1.5", href="https://huggingface.co/nomic-ai/nomic-embed-text-v1.5", target="_blank", className="text-muted")),
                        html.Li(html.A("Model nomic-embed-vision-v1.5", href="https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5", target="_blank", className="text-muted"))
                    ])
                ])
            ])
        ])
    ], className="mt-5 p-3 bg-light border-top")
])
238
 
239
+ # -------------------- UI Callbacks --------------------
240
+ # ------------------------------------------------------
241
+
242
+ ##-------------------- Tabs Callbacks --------------------
243
+ @app.callback(
244
+ Output("data-tab-content", "children"),
245
+ Input("data-tabs", "value")
246
+ )
247
+ def render_tab_content(tab):
248
+ if tab == "omeka":
249
+ return html.Div([
250
+ html.Div([
251
+ html.H5("🔍 From Omeka S", className="mb-3"),
252
+ # API URL input with full width
253
+ dbc.InputGroup([
254
+ dbc.Input(
255
+ id="api-url",
256
+ value="https://your-omeka-instance.org",
257
+ type="url",
258
+ placeholder="Enter your Omeka S instance URL",
259
+ className="mb-2"
260
+ ),
261
+ ]),
262
+ # Buttons and dropdowns container
263
+ dbc.Container([
264
+ dbc.Row([
265
+ dbc.Col([
266
+ dbc.Button(
267
+ "Load Item Sets",
268
+ id="load-sets",
269
+ color="link",
270
+ size="sm",
271
+ className="w-100 mb-2"
272
+ ),
273
+ ]),
274
+ ]),
275
+ dbc.Row([
276
+ dbc.Col([
277
+ dcc.Dropdown(
278
+ id="items-sets-dropdown",
279
+ placeholder="Select a collection",
280
+ className="mb-2"
281
+ ),
282
+ ]),
283
+ ]),
284
+ dbc.Row([
285
+ dbc.Col([
286
+ dbc.Input(
287
+ id="table-name",
288
+ value="Enter a table name for data storage",
289
+ type="text",
290
+ placeholder="New table name",
291
+ className="mb-2"
292
+ ),
293
+ ]),
294
+ ]),
295
+ dbc.Row([
296
+ dbc.Col([
297
+ dbc.Button(
298
+ "Process Omeka Collection",
299
+ id="process-omeka",
300
+ color="success",
301
+ size="sm",
302
+ className="mt-2"
303
+ ),
304
+ ]),
305
+ ]),
306
+ ], fluid=True, className="p-0"),
307
+ ], className="p-3"),
308
+ ], className="border rounded bg-white shadow-sm")
309
+ elif tab == "lance":
310
+ return html.Div([
311
  html.H5("📁 From LanceDB"),
312
+ dbc.Button("Load LanceDB tables", id="load-tables", color="link", size="sm", className="mt-2"),
313
  dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
314
+ dbc.Button("Display Table", id="load-data-db", color="success", size="sm", className="mt-2"),
315
+ dbc.Button("Drop Table", id="drop-data-db", color="danger", size="sm", className="mt-2"),
316
+ ])
317
 
318
+ return html.Div("Invalid tab selected.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
+ # -------------------- Collapse callback --------------------
@app.callback(
    Output("explanation-collapse", "is_open"),
    Input("explanation-toggle", "n_clicks"),
    prevent_initial_call=True
)
def toggle_collapse(n):
    """Open the explanations panel on odd click counts, close it on even ones.

    Args:
        n: Number of clicks received by the "explanation-toggle" button.

    Returns:
        True when ``n`` is odd, False otherwise.
    """
    is_odd_click = bool(n % 2)
    return is_odd_click
328
+
329
+ # -------------------- Graph placeholder Toggle callback --------------------
@app.callback(
    Output("graph-placeholder", "style"),
    Output("umap-graph", "style"),
    [Input("umap-graph", "figure")],
    prevent_initial_call=True
)
def toggle_graph_visibility(figure):
    """Swap the placeholder for the graph once a figure is present.

    Args:
        figure: Current "figure" property of the "umap-graph" component.

    Returns:
        A ``(placeholder_style, graph_style)`` pair: the placeholder is shown
        and the graph hidden while ``figure`` is None, and the reverse
        otherwise.
    """
    hidden = {"display": "none"}
    if figure is not None:
        graph_style = {"flex": 3, "width": "100%", "display": "block"}
        return hidden, graph_style
    return {"display": "flex"}, hidden
344
 
345
+ # -------------------- Features Callbacks --------------------
346
+ # ------------------------------------------------------------
347
 
348
+ ## -------------------- Load Omeka collections callback--------------------
349
 
350
  @app.callback(
351
  Output("items-sets-dropdown", "options"),
 
354
  State("api-url", "value"),
355
  prevent_initial_call=True
356
  )
357
+ def load_item_sets(n_clicks, base_url):
358
+ if n_clicks is None: # Add this check
359
+ raise PreventUpdate
360
  client = OmekaSClient(base_url, "...", "...", 50)
361
  try:
362
  item_sets = client.list_all_item_sets()
 
370
  except Exception as e:
371
  return dash.no_update, dash.no_update
372
 
373
+ ## -------------------- Load LanceDB tables callback--------------------
@app.callback(
    Output("db-tables-dropdown", "options", allow_duplicate=True),
    Input("load-tables", "n_clicks"),
    prevent_initial_call=True
)
def list_tables(n_clicks):
    """Populate the table dropdown with the names returned by the manager.

    Args:
        n_clicks: Click count of the "load-tables" button.

    Returns:
        A list of ``{"label": name, "value": name}`` option dicts.

    Raises:
        PreventUpdate: When the button has not been clicked.
    """
    if n_clicks:
        return [dict(label=name, value=name) for name in manager.list_tables()]
    raise PreventUpdate
384
 
385
+ ## -------------------- Load & Process Omeka items callback--------------------
def _name_clusters_by_top_terms(df, n_terms=5):
    """Map each cluster label in ``df["Cluster"]`` to a readable topic name.

    A TF-IDF matrix is fitted on ``df["text"]``; every cluster is named after
    the ``n_terms`` terms with the highest mean TF-IDF weight among its rows.
    The label -1 is named "Noise".
    """
    vectorizer = text.TfidfVectorizer(max_features=1000, stop_words=list(french_stopwords), lowercase=True)
    tfidf_matrix = vectorizer.fit_transform(df["text"].astype(str).tolist())
    # Looked up once: the vocabulary does not change between clusters.
    vocabulary = vectorizer.get_feature_names_out()

    names = {}
    for label in sorted(df["Cluster"].unique()):
        if label == -1:
            names[label] = "Noise"
            continue
        row_positions = (df["Cluster"] == label).to_numpy().nonzero()[0]
        mean_tfidf = np.asarray(tfidf_matrix[row_positions].mean(axis=0)).flatten()
        top_indices = mean_tfidf.argsort()[::-1][:n_terms]
        names[label] = ", ".join(vocabulary[i] for i in top_indices)
    return names


@app.callback(
    Output("umap-graph", "figure"),
    Output("status", "children"),
    Input("process-omeka", "n_clicks"),  # Changed ID to match new button
    State("items-sets-dropdown", "value"),
    State("omeka-client-config", "data"),
    State("table-name", "value"),
    prevent_initial_call=True
)
def handle_omeka_data(n_clicks, item_set_id, client_config, table_name):
    """Harvest an Omeka S item set, embed, project, cluster, store and plot it.

    Args:
        n_clicks: Click count of the "process-omeka" button.
        item_set_id: Selected item set id from the collection dropdown.
        client_config: Dict read from the session store; the keys "base_url",
            "key_identity" and "key_credential" are used to build the client.
        table_name: Name of the LanceDB table the processed rows are written to.

    Returns:
        ``(figure, status_message)``. When no table name is given or nothing
        could be harvested, the figure is left untouched (``dash.no_update``)
        and the status explains why.

    Raises:
        PreventUpdate: When the button was not clicked or no client
            configuration is stored.
    """
    if not n_clicks or not client_config:
        raise PreventUpdate

    # Check the cheap precondition before the expensive harvest/embedding work.
    if not table_name:
        return dash.no_update, "Please enter a table name before processing a collection."

    client = OmekaSClient(
        base_url=client_config["base_url"],
        key_identity=client_config["key_identity"],
        key_credential=client_config["key_credential"]
    )

    df_omeka = harvest_omeka_items(client, item_set_id=item_set_id)
    # harvest_omeka_items returns None on client errors or when no item with
    # images could be parsed; without this guard the next line would raise.
    if df_omeka is None or df_omeka.empty:
        return dash.no_update, "No items with images could be harvested from the selected collection."

    items = df_omeka.to_dict(orient="records")
    records_with_text = [
        helpers.add_concatenated_text_field_exclude_keys(
            item,
            keys_to_exclude=['id', 'images_urls'],
            text_field_key='text',
            pair_separator=' - ',
        )
        for item in items
    ]
    df = helpers.prepare_df_atlas(pd.DataFrame(records_with_text), id_col='id', images_col='images_urls')

    text_embed = helpers.generate_text_embed(df['text'].tolist())
    img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
    embeddings = (text_embed + img_embed) / 2  # Average the embeddings
    df["embeddings"] = embeddings.tolist()

    # 2D projection; random_state is fixed so repeated runs give the same layout.
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, metric='cosine', random_state=42)
    umap_embeddings = reducer.fit_transform(embeddings)
    df["umap_embeddings"] = umap_embeddings.tolist()

    clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
    df["Cluster"] = clusterer.fit_predict(umap_embeddings)

    df["Topic"] = df["Cluster"].map(_name_clusters_by_top_terms(df))

    manager.initialize_table(table_name)
    manager.add_entry(table_name, df.to_dict(orient="records"))

    return create_umap_plot(df)
444
 
445
+ ## -------------------- Load LanceDB data callback--------------------
@app.callback(
    Output("umap-graph", "figure", allow_duplicate=True),
    Output("status", "children", allow_duplicate=True),
    Input("load-data-db", "n_clicks"),
    State("db-tables-dropdown", "value"),
    prevent_initial_call=True
)
def handle_db_data(n_clicks, db_table):
    """Load a stored LanceDB table and plot it.

    Args:
        n_clicks: Click count of the "load-data-db" button.
        db_table: Table name selected in the "db-tables-dropdown" component.

    Returns:
        ``(figure, status_message)``. When the table has no rows or no
        "umap_embeddings" column, the figure is left untouched
        (``dash.no_update``) and the status explains why.

    Raises:
        PreventUpdate: When the button was not clicked or no table is selected.
    """
    if not n_clicks or not db_table:
        raise PreventUpdate

    items = manager.get_content_table(db_table)
    df = pd.DataFrame(items)
    # Drop columns that hold no data at all before filling remaining gaps.
    df = df.dropna(axis=1, how='all')
    # The plot is built from the "umap_embeddings" column; without it (or
    # without rows) plotting would raise, so report the problem instead.
    if df.empty or "umap_embeddings" not in df.columns:
        return dash.no_update, f"Table '{db_table}' is empty or has no stored UMAP coordinates."
    df = df.fillna('')
    return create_umap_plot(df)
463
 
464
+ ## -------------------- plotly Hover datapoint callback--------------------
@app.callback(
    Output("point-details", "children"),
    Input("umap-graph", "hoverData")
)
def show_point_details(hoverData):
    """Render the side panel for the point currently under the cursor.

    Args:
        hoverData: "hoverData" property of the "umap-graph" component.

    Returns:
        A prompt Div when nothing is hovered; otherwise a Div with the title,
        item id, image and description taken from the point's ``customdata``.
    """
    if not hoverData:
        return html.Div("🖱️ Hover a point to see more details.", style={"color": "#888"})

    hovered_point = hoverData["points"][0]
    # customdata holds five fields in this order; the first is not displayed.
    _row_id, item_id, img_url, title, desc = hovered_point["customdata"]

    image_style = {
        "maxWidth": "300px",  # Fixed max width instead of 100%
        "height": "auto",  # Maintain aspect ratio
        "marginBottom": "10px",
        "borderRadius": "5px",
        "boxShadow": "0 2px 4px rgba(0,0,0,0.1)"
    }
    description_style = {"lineHeight": "1.6", "color": "#444", "fontSize": "0.9rem"}

    heading = html.H4(title, style={"fontSize": "1.2rem"})
    item_line = html.P(f"Item ID: {item_id}", style={"fontSize": "0.9rem", "color": "#666"})
    picture = html.Img(src=img_url, style=image_style)
    description = html.P(desc or "No description available.", style=description_style)
    return html.Div([heading, item_line, picture, description])
486
 
487
+ ## -------------------- Search & filter datapoint callback--------------------
@app.callback(
    Output("umap-graph", "figure", allow_duplicate=True),
    Input("search-button", "n_clicks"),
    Input("search-limit-slider", "value"),  # Add slider input
    State("search-input", "value"),
    State("db-tables-dropdown", "value"),
    State("umap-graph", "figure"),
    prevent_initial_call=True
)
def filter_points(n_clicks, limit, search_query, table, current_fig):
    """Highlight the points that match a semantic search query.

    Args:
        n_clicks: Click count of the "search-button" (only used as a trigger).
        limit: Number of results requested, from the "search-limit-slider".
        search_query: Text typed in the "search-input" field.
        table: Table name selected in the "db-tables-dropdown" component.
        current_fig: Current figure of the "umap-graph" component.

    Returns:
        The figure with matching points selected and the others dimmed, or,
        for an empty query triggered by the button, the figure with any
        previous selection removed.

    Raises:
        PreventUpdate: When no figure is drawn yet, when the slider moves
            without a query, or when a query is given but no table is selected.
    """
    # Nothing to filter before a figure exists.
    if not current_fig:
        raise PreventUpdate

    if not search_query:
        # Moving the slider without a query must not touch the figure.
        if ctx.triggered_id == "search-limit-slider":
            raise PreventUpdate
        # Empty search: show every trace again and drop any earlier selection
        # so previously dimmed points return to their normal look.
        for trace in current_fig.get('data', []):
            trace['visible'] = True
            trace.pop('selectedpoints', None)
            trace.pop('unselected', None)
        return current_fig

    # The semantic search runs against a stored table; without one there is
    # nothing to query.
    if not table:
        raise PreventUpdate

    # Generate text embedding
    query_embed = helpers.generate_text_embed([f"search_query: {search_query}"]).tolist()

    # Perform semantic search using the slider value
    matching = manager.semantic_search(
        table_name=table,
        query_embed=query_embed,
        limit=limit
    )

    # Set membership keeps the per-point lookup below cheap.
    matching_ids = {item['id'] for item in json.loads(matching)}
    print(f"Searching for '{search_query}' with limit {limit}")
    print(f"Found {len(matching_ids)} matches")

    # Select the matching points in every trace and dim the rest.
    fig = go.Figure(current_fig)
    for trace in fig.data:
        customdata = trace['customdata']
        # The first customdata field of each point is its row id.
        point_ids = [point[0] for point in customdata] if customdata is not None else []
        selected_indices = [i for i, point_id in enumerate(point_ids) if point_id in matching_ids]
        trace.update(
            selectedpoints=selected_indices,
            unselected=dict(marker=dict(opacity=0.1))
        )

    return fig
536
+
537
+ ## -------------------- Clear search callback--------------------
@app.callback(
    Output("umap-graph", "figure", allow_duplicate=True),
    Output("search-input", "value"),  # Clear the search input
    Input("clear-button", "n_clicks"),
    State("umap-graph", "figure"),
    prevent_initial_call=True
)
def clear_search(n_clicks, current_fig):
    """Remove the search highlighting and empty the search field.

    Args:
        n_clicks: Click count of the "clear-button".
        current_fig: Current figure of the "umap-graph" component.

    Returns:
        ``(figure, "")``. When no figure has been drawn yet, the figure output
        is left untouched (``dash.no_update``) and only the input is cleared.

    Raises:
        PreventUpdate: When the button has not been clicked.
    """
    if not n_clicks:
        raise PreventUpdate

    # Without a figure there is nothing to reset; building one from None
    # would push an empty figure into the graph.
    if not current_fig:
        return dash.no_update, ""

    fig = go.Figure(current_fig)

    # Drop the selection state only. Trace-level opacity is deliberately left
    # alone: it compounds with the marker opacity already stored in the
    # figure, so setting it here would make points fainter than before the
    # search.
    for trace in fig.data:
        trace.update(
            selectedpoints=None,
            unselected=None
        )

    return fig, ""  # Return cleared figure and empty search input
560
+
561
+ ## -------------------- Drop LanceDB table callback--------------------
@app.callback(
    Output("db-tables-dropdown", "options", allow_duplicate=True),  # Update dropdown options
    Output("status", "children", allow_duplicate=True),  # Show status message
    Output("db-tables-dropdown", "value", allow_duplicate=True),  # Clear current selection
    Input("drop-data-db", "n_clicks"),
    State("db-tables-dropdown", "value"),
    prevent_initial_call=True
)
def drop_db_data(n_clicks, db_table):
    """Drop the selected table and refresh the table dropdown.

    Args:
        n_clicks: Click count of the "drop-data-db" button.
        db_table: Table name selected in the "db-tables-dropdown" component.

    Returns:
        ``(options, status_message, selected_value)``. On success the options
        are rebuilt from the manager and the selection is cleared; on failure
        or error only the status message changes.

    Raises:
        PreventUpdate: When the button was not clicked or no table is selected.
    """
    if not (n_clicks and db_table):
        raise PreventUpdate

    try:
        # Delete the table; a falsy result is reported as a failure.
        if not manager.drop_table(db_table):
            return dash.no_update, f"Failed to delete table '{db_table}'", dash.no_update

        # Rebuild the dropdown from the tables that remain.
        remaining = [{"label": name, "value": name} for name in manager.list_tables()]
        return remaining, f"Table '{db_table}' successfully deleted", None

    except Exception as e:
        print(f"Error dropping table: {str(e)}")
        return dash.no_update, f"Error: {str(e)}", dash.no_update
589
+
590
  # -------------------- Utility --------------------
591
+ # -------------------------------------------------
592
 
593
  def harvest_omeka_items(client, item_set_id=None, per_page=50):
594
  """
 
602
  """
603
  print("\n--- Fetching and Parsing Multiple Items by colection---")
604
  try:
605
+ # Fetch items
606
  items_list = client.list_all_items(item_set_id=item_set_id, per_page=per_page)
607
+ print(f"Initial fetch: {len(items_list)} items")
 
608
 
609
  parsed_items_list = []
610
+ for idx, item_raw in enumerate(items_list):
611
+ try:
612
+ print(f"\nProcessing item {idx + 1}/{len(items_list)}")
613
+ if 'o:media' not in item_raw:
614
+ print(f"Skipping item {idx + 1}: No media found")
615
+ continue
616
+
617
  parsed = client.digest_item_data(item_raw, prefixes=_DEFAULT_PARSE_METADATA)
618
+ if not parsed:
619
+ print(f"Skipping item {idx + 1}: Parsing failed")
620
+ continue
621
+
622
+ # Debug media processing
623
+ medias_id = [x["o:id"] for x in item_raw["o:media"]]
624
+ print(f"Found {len(medias_id)} media items")
625
+
626
+ medias_list = []
627
+ for media_id in medias_id:
628
+ try:
629
  media = client.get_media(media_id)
630
+ print(f"Media type: {media.get('o:media_type', 'unknown')}")
631
+ if "image" in media.get("o:media_type", ""):
632
+ url = media.get('o:original_url')
633
+ if url:
634
+ medias_list.append(url)
635
+ else:
636
+ print(f"No URL found for media {media_id}")
637
+ except Exception as e:
638
+ print(f"Error processing media {media_id}: {str(e)}")
639
+
640
+ if medias_list:
641
+ parsed["images_urls"] = medias_list
642
+ parsed_items_list.append(parsed)
643
+ print(f"Added item with {len(medias_list)} images")
644
+ else:
645
+ print(f"Skipping item {idx + 1}: No valid image URLs found")
646
+
647
+ except Exception as e:
648
+ print(f"Error processing item {idx + 1}: {str(e)}")
649
+ print(f"Item raw data: {item_raw}")
650
+ continue
651
+
652
+ if not parsed_items_list:
653
+ print("No valid items were parsed!")
654
+ return None
655
+
656
+ print(f"\nFinal results:")
657
+ print(f"Total items processed: {len(items_list)}")
658
+ print(f"Successfully parsed items: {len(parsed_items_list)}")
659
+
660
+ df = pd.DataFrame(parsed_items_list)
661
+ print(f"DataFrame columns: {df.columns.tolist()}")
662
+ print(f"DataFrame shape: {df.shape}")
663
+ return df
664
+
665
  except OmekaSClientError as e:
666
+ print(f"Omeka client error: {str(e)}")
667
+ return None
668
  except Exception as e:
669
+ print(f"Unexpected error: {str(e)}")
670
+ print(f"Error type: {type(e)}")
671
+ import traceback
672
+ print(f"Traceback:\n{traceback.format_exc()}")
673
+ return None
674
 
def create_umap_plot(df):
    """Build the scatter figure of the 2D projection and a status message.

    Args:
        df: DataFrame with the columns "umap_embeddings", "Topic", "id",
            "item_id", "images_urls", "Title" and "Description".

    Returns:
        ``(figure, status_message)`` where the message reports ``len(df)``.
    """
    xy = np.array(df["umap_embeddings"].tolist())

    # Per-point payload, read back from ``customdata`` in this order.
    payload_columns = ("id", "item_id", "images_urls", "Title", "Description")
    point_payload = [df[column] for column in payload_columns]

    fig = px.scatter(
        df,
        x=xy[:, 0],
        y=xy[:, 1],
        color="Topic",  # Start with top-level topics
        custom_data=point_payload,
        hover_data=None,
        title="UMAP Projection with HDBSCAN Topics",
        color_discrete_sequence=px.colors.qualitative.D3,
        width=900,
        height=700,
    )

    # Marker style: larger, slightly transparent, borderless circles.
    marker_style = dict(
        size=12,
        opacity=0.8,
        line=dict(width=0),
        symbol='circle',
    )
    # Native hover labels are switched off.
    fig.update_traces(
        marker=marker_style,
        hoverinfo='none',
        hovertemplate=None,
    )

    # Re-wrap the figure as a go.Figure, as the original code does.
    fig = go.Figure(fig)

    def _bare_axis():
        # Axis without grid, zero line, axis line or tick labels; zoomable.
        return dict(
            showgrid=False,
            zeroline=False,
            showline=False,
            showticklabels=False,
            fixedrange=False,
        )

    legend_box = dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
        bgcolor='rgba(255,255,255,0.8)',
        bordercolor='rgba(0,0,0,0)',
    )
    extra_modebar_buttons = ['zoom', 'pan', 'zoomIn', 'zoomOut', 'resetScale']

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        height=700,
        margin=dict(t=30, b=30, l=30, r=30),
        showlegend=False,
        legend=legend_box,
        xaxis=_bare_axis(),
        yaxis=_bare_axis(),
        dragmode='pan',
        modebar_add=extra_modebar_buttons,
    )

    status = f"Loaded {len(df)} items and projected into 2D."
    return fig, status
745
 
# Start the Dash server on port 7860 when the file is run as a script.
if __name__ == "__main__":
    app.run(port=7860)