Upload 6 files

Files added:
- .env (+2 lines)
- Dockerfile (+12 lines)
- app.py (+286 lines)
- helpers.py (+146 lines)
- lancedb_client.py (+100 lines)
- requirements.txt (+18 lines)
.env
ADDED

_DEFAULT_PARSE_METADATA=('dcterms:identifier','dcterms:type','dcterms:title','dcterms:description','dcterms:creator','dcterms:publisher','dcterms:date','dcterms:spatial','dcterms:format','dcterms:provenance','dcterms:subject','dcterms:medium','bibo:annotates','bibo:content','bibo:locator','bibo:owner')
HF_TOKEN=""
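Both variables are read at import time via python-dotenv. Since _DEFAULT_PARSE_METADATA is stored as a Python tuple literal, ast.literal_eval is one safe way to recover it as a tuple; a minimal sketch, run from the project root so .env is found:

import ast
import os
from dotenv import load_dotenv

load_dotenv()  # picks up .env from the current working directory

# The value is a Python tuple literal, so literal_eval parses it safely
metadata_keys = ast.literal_eval(os.getenv("_DEFAULT_PARSE_METADATA", "()"))
print(metadata_keys[:3])  # ('dcterms:identifier', 'dcterms:type', 'dcterms:title')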
Dockerfile
ADDED

# syntax=docker/dockerfile:1
FROM python:3.11-slim
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --upgrade pip
RUN pip install --no-cache-dir --upgrade -r requirements.txt
COPY --chown=user . /app
EXPOSE 8050
CMD gunicorn --workers 5 --threads 2 -b 0.0.0.0:8050 --timeout 0 app:server
app.py
ADDED

import dash
from dash import dcc, html, Input, Output, State, ctx
import dash_bootstrap_components as dbc
import plotly.express as px
import pandas as pd
import numpy as np
import umap
import hdbscan
import sklearn.feature_extraction.text as text
from dash.exceptions import PreventUpdate
import os
from dotenv import load_dotenv
import helpers
import lancedb
from omeka_s_api_client import OmekaSClient, OmekaSClientError
from lancedb_client import LanceDBManager

# Load .env for credentials
load_dotenv()
_DEFAULT_PARSE_METADATA = (
    'dcterms:identifier', 'dcterms:type', 'dcterms:title', 'dcterms:description',
    'dcterms:creator', 'dcterms:publisher', 'dcterms:date', 'dcterms:spatial',
    'dcterms:format', 'dcterms:provenance', 'dcterms:subject', 'dcterms:medium',
    'bibo:annotates', 'bibo:content', 'bibo:locator', 'bibo:owner'
)

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
app.config.suppress_callback_exceptions = True
server = app.server
manager = LanceDBManager()

french_stopwords = text.ENGLISH_STOP_WORDS.union([
    "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon",
    "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans",
    "des", "du", "dedans", "dehors", "depuis", "devrait", "doit", "donc", "dos",
    "début", "elle", "elles", "en", "encore", "essai", "est", "et", "eu", "fait",
    "faites", "fois", "font", "hors", "ici", "il", "ils", "je", "juste", "la", "le",
    "les", "leur", "là", "ma", "maintenant", "mais", "mes", "mine", "moins", "mon",
    "mot", "même", "ni", "nommés", "notre", "nous", "nouveaux", "ou", "où", "par",
    "parce", "parole", "pas", "personnes", "peut", "peu", "pièce", "plupart", "pour",
    "pourquoi", "quand", "que", "quel", "quelle", "quelles", "quels", "qui", "sa",
    "sans", "ses", "seulement", "si", "sien", "son", "sont", "sous", "soyez", "sujet",
    "sur", "ta", "tandis", "tellement", "tels", "tes", "ton", "tous", "tout", "trop",
    "très", "tu", "valeur", "voie", "voient", "vont", "votre", "vous", "vu", "ça",
    "étaient", "état", "étions", "été", "être"
])

# -------------------- Layout --------------------
app.layout = dbc.Container([
    html.H2("🌍 Omeka S UMAP Explorer", className="text-center mt-4"),
    html.Hr(),

    # Input controls
    dbc.Row([
        dbc.Col([
            html.H5("🔍 From Omeka S"),
            dcc.Input(id="api-url", value="https://your-omeka-instance.org", type="text", className="form-control"),
            dbc.Button("Load Item Sets", id="load-sets", color="secondary", className="mt-2"),
            dcc.Dropdown(id="items-sets-dropdown", placeholder="Select a collection"),
            dcc.Input(id="table-name", value="my_table", type="text", className="form-control mt-2", placeholder="New table name"),
            dbc.Button("Process Omeka Collection", id="load-data", color="primary", className="mt-2"),
        ], md=4),

        dbc.Col([
            html.H5("📁 From LanceDB"),
            dbc.Button("Load Existing Tables", id="load-tables", color="info"),
            dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
            dbc.Button("Display Table", id="load-data-db", color="success", className="mt-2"),
        ], md=4),

        dbc.Col([
            html.H5("🔎 Query Tool (coming soon)"),
            dbc.Input(placeholder="Type a search query...", type="text", disabled=True),
        ], md=4),
    ], className="mb-4"),

    # Main plot area and metadata side panel
    dbc.Row([
        dbc.Col(
            dcc.Graph(id="umap-graph", style={"height": "700px"}),
            md=8
        ),
        dbc.Col(
            html.Div(id="point-details", style={
                "padding": "15px",
                "borderLeft": "1px solid #ccc",
                "height": "700px",
                "overflowY": "auto"
            }),
            md=4
        ),
    ]),

    # Status/info
    html.Div(id="status", className="mt-3"),

    dcc.Store(id="omeka-client-config", storage_type="session")
], fluid=True)

# -------------------- Callbacks --------------------

@app.callback(
    Output("items-sets-dropdown", "options"),
    Output("omeka-client-config", "data"),
    Input("load-sets", "n_clicks"),
    State("api-url", "value"),
    prevent_initial_call=True
)
def load_item_sets(n, base_url):
    client = OmekaSClient(base_url, "...", "...", 50)
    try:
        item_sets = client.list_all_item_sets()
        options = [{"label": s.get('dcterms:title', [{}])[0].get('@value', 'N/A'), "value": s["o:id"]} for s in item_sets]
        return options, {
            "base_url": base_url,
            "key_identity": "...",
            "key_credential": "...",
            "default_per_page": 50
        }
    except Exception:
        return dash.no_update, dash.no_update

@app.callback(
    Output("db-tables-dropdown", "options"),
    Input("load-tables", "n_clicks"),
    prevent_initial_call=True
)
def list_tables(n):
    return [{"label": t, "value": t} for t in manager.list_tables()]

@app.callback(
    Output("umap-graph", "figure"),
    Output("status", "children"),
    Input("load-data", "n_clicks"),      # From Omeka S
    Input("load-data-db", "n_clicks"),   # From DB table
    State("items-sets-dropdown", "value"),
    State("omeka-client-config", "data"),
    State("table-name", "value"),
    State("db-tables-dropdown", "value"),
    prevent_initial_call=True
)
def handle_data_loading(n_clicks_omeka, n_clicks_db, item_set_id, client_config, table_name, db_table):
    triggered_id = ctx.triggered_id
    print(triggered_id)

    if triggered_id == "load-data":  # Omeka S case
        if not client_config:
            raise PreventUpdate

        client = OmekaSClient(
            base_url=client_config["base_url"],
            key_identity=client_config["key_identity"],
            key_credential=client_config["key_credential"]
        )

        df_omeka = harvest_omeka_items(client, item_set_id=item_set_id)
        items = df_omeka.to_dict(orient="records")
        records_with_text = [helpers.add_concatenated_text_field_exclude_keys(item, keys_to_exclude=['id', 'images_urls'], text_field_key='text', pair_separator=' - ') for item in items]
        df = helpers.prepare_df_atlas(pd.DataFrame(records_with_text), id_col='id', images_col='images_urls')

        text_embed = helpers.generate_text_embed(df['text'].tolist())
        img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
        embeddings = np.concatenate([text_embed, img_embed], axis=1)
        df["embeddings"] = embeddings.tolist()

        reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
        umap_embeddings = reducer.fit_transform(embeddings)
        df["umap_embeddings"] = umap_embeddings.tolist()

        clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
        cluster_labels = clusterer.fit_predict(umap_embeddings)
        df["Cluster"] = cluster_labels

        vectorizer = text.TfidfVectorizer(max_features=1000, stop_words=list(french_stopwords), lowercase=True)
        tfidf_matrix = vectorizer.fit_transform(df["text"].astype(str).tolist())
        top_words = []
        for label in sorted(df["Cluster"].unique()):
            if label == -1:
                top_words.append("Noise")
                continue
            mask = (df["Cluster"] == label).to_numpy().nonzero()[0]
            cluster_docs = tfidf_matrix[mask]
            mean_tfidf = cluster_docs.mean(axis=0)
            mean_tfidf = np.asarray(mean_tfidf).flatten()
            top_indices = mean_tfidf.argsort()[::-1][:5]
            terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
            top_words.append(", ".join(terms))
        cluster_name_map = {label: name for label, name in zip(sorted(df["Cluster"].unique()), top_words)}
        df["Topic"] = df["Cluster"].map(cluster_name_map)
        # Cast to string before persisting: the LanceDB schema declares Cluster as pa.string()
        df["Cluster"] = df["Cluster"].astype(str)

        manager.initialize_table(table_name)
        manager.add_entry(table_name, df.to_dict(orient="records"))

    elif triggered_id == "load-data-db":  # Load existing LanceDB table
        if not db_table:
            raise PreventUpdate
        items = manager.get_content_table(db_table)
        df = pd.DataFrame(items)
        df = df.dropna(axis=1, how='all')
        df = df.fillna('')
        #umap_embeddings = np.array(df["umap_embeddings"].tolist())

    else:
        raise PreventUpdate

    # Plotting
    return create_umap_plot(df)


@app.callback(
    Output("point-details", "children"),
    Input("umap-graph", "clickData")
)
def show_point_details(clickData):
    if not clickData:
        return html.Div("🖱️ Click a point to see more details.", style={"color": "#888"})
    img_url, title, desc = clickData["points"][0]["customdata"]
    return html.Div([
        html.H4(title),
        html.Img(src=img_url, style={"maxWidth": "100%", "marginBottom": "10px"}),
        html.P(desc or "No description available.")
    ])

# -------------------- Utility --------------------

def harvest_omeka_items(client, item_set_id=None, per_page=50):
    """
    Fetch and parse items from Omeka S.
    Args:
        client: OmekaSClient instance
        item_set_id: ID of the item set to fetch items from (optional)
        per_page: Number of items to fetch per page (default: 50)
    Returns:
        DataFrame containing parsed item data (empty on error)
    """
    print("\n--- Fetching and Parsing Multiple Items by collection ---")
    try:
        # Fetch all items, optionally restricted to the selected item set
        items_list = client.list_all_items(item_set_id=item_set_id, per_page=per_page)
        print(items_list)
        print(f"Fetched {len(items_list)} items.")

        parsed_items_list = []
        for item_raw in items_list:
            if 'o:media' in item_raw:
                parsed = client.digest_item_data(item_raw, prefixes=_DEFAULT_PARSE_METADATA)
                if parsed:  # Only add if parsing was successful
                    # Add media (image URLs only)
                    medias_id = [x["o:id"] for x in item_raw["o:media"]]
                    medias_list = []
                    for media_id in medias_id:
                        media = client.get_media(media_id)
                        if "image" in media["o:media_type"]:
                            medias_list.append(media.get('o:original_url'))
                    if medias_list:  # Only append if there are image URLs
                        parsed["images_urls"] = medias_list
                        parsed_items_list.append(parsed)

        print(f"Successfully parsed {len(parsed_items_list)} items.")
        # Note: List columns (like dcterms:title) might need further handling in Pandas
        print("\nDataFrame from parsed items:")
        return pd.DataFrame(parsed_items_list)
    except OmekaSClientError as e:
        print(f"Error fetching/parsing multiple items: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during multi-item parsing: {e}")
    return pd.DataFrame()  # keep the caller's .to_dict() from failing on None

def create_umap_plot(df):
    coords = np.array(df["umap_embeddings"].tolist())
    fig = px.scatter(
        df, x=coords[:, 0], y=coords[:, 1],
        color="Topic",
        custom_data=["images_urls", "Title", "Description"],
        hover_data=None,
        title="UMAP Projection with HDBSCAN Topics"
    )
    fig.update_traces(
        marker=dict(size=8, line=dict(width=1, color="DarkSlateGrey")),
        # Plotly hover labels render only a small HTML subset; the <img> tag is
        # ignored there, so the image itself is shown in the side panel on click.
        hovertemplate="<b>%{customdata[1]}</b><br><img src='%{customdata[0]}' height='150'><extra></extra>"
    )
    fig.update_layout(height=700, margin=dict(t=30, b=30, l=30, r=30))
    return fig, f"Loaded {len(df)} items and projected into 2D."

if __name__ == "__main__":
    app.run(debug=True)
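The core of handle_data_loading is an embed → UMAP → HDBSCAN → TF-IDF-labeling pipeline. A self-contained sketch of the same steps on synthetic data, useful for exercising the labeling logic without an Omeka instance (the documents and the random embeddings are made up for illustration; the real app feeds 1536-dim text+image vectors):

import numpy as np
import umap
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer

rng = np.random.default_rng(0)
docs = [f"document {i} about subject {i % 3}" for i in range(60)]
embeddings = rng.normal(size=(60, 1536))  # stand-in for concatenated text+image vectors

# Project to 2D, then run density-based clustering on the projected points
coords = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine").fit_transform(embeddings)
labels = hdbscan.HDBSCAN(min_cluster_size=10).fit_predict(coords)

# Name each cluster by its top mean-TF-IDF terms; HDBSCAN marks noise as -1
vectorizer = TfidfVectorizer(max_features=1000, lowercase=True)
tfidf = vectorizer.fit_transform(docs)
terms = vectorizer.get_feature_names_out()
for label in sorted(set(labels)):
    if label == -1:
        print(-1, "Noise")
        continue
    idx = np.flatnonzero(labels == label)
    mean_tfidf = np.asarray(tfidf[idx].mean(axis=0)).flatten()
    print(label, ", ".join(terms[i] for i in mean_tfidf.argsort()[::-1][:5]))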
helpers.py
ADDED

from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
from PIL import Image
import requests
import os
import json
import math
import re
import pandas as pd
import numpy as np
from omeka_s_api_client import OmekaSClient, OmekaSClientError
from typing import List, Dict, Any, Union
import io
from dotenv import load_dotenv

# env var
load_dotenv(os.path.join(os.getcwd(), ".env"))
HF_TOKEN = os.environ.get("HF_TOKEN")

# Nomic vision model
processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)

# Nomic text model
text_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True, token=HF_TOKEN)

def image_url_to_pil(url: str, max_size=(512, 512)) -> Image.Image:
    """
    Download an image and return it as a resized RGB PIL image.
    Ex usage: image_blobs = df["image_url"].apply(image_url_to_pil).tolist()
    """
    response = requests.get(url, stream=True, timeout=5)
    response.raise_for_status()
    image = Image.open(io.BytesIO(response.content)).convert("RGB")
    image.thumbnail(max_size, Image.Resampling.LANCZOS)
    return image

def generate_img_embed(images_urls, batch_size=20):
    """Generate image embeddings in batches to manage memory usage.

    Args:
        images_urls (list): List of image URLs
        batch_size (int): Number of images to process at once
    """
    all_embeddings = []

    for i in range(0, len(images_urls), batch_size):
        batch_urls = images_urls[i:i + batch_size]
        images = [image_url_to_pil(image_url) for image_url in batch_urls]
        inputs = processor(images, return_tensors="pt")
        with torch.no_grad():  # inference only, no gradients needed
            img_emb = vision_model(**inputs).last_hidden_state
        # CLS-token embedding, L2-normalized
        img_embeddings = F.normalize(img_emb[:, 0], p=2, dim=1)
        all_embeddings.append(img_embeddings.detach().numpy())

    return np.vstack(all_embeddings)

def generate_text_embed(sentences: List, batch_size=64):
    """Generate text embeddings in batches to manage memory usage.

    Args:
        sentences (List): List of text strings to encode
        batch_size (int): Number of sentences to process at once
    """
    all_embeddings = []

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        embeddings = text_model.encode(batch_sentences)
        all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

def add_concatenated_text_field_exclude_keys(item_dict, keys_to_exclude=None, text_field_key="text", pair_separator=" - "):
    """Concatenate all non-excluded, non-empty key/value pairs into a single
    'search_document: ...' text field (the prefix the Nomic text model expects)."""
    if not isinstance(item_dict, dict):
        raise TypeError("Input must be a dictionary.")
    if keys_to_exclude is None:
        keys_to_exclude = set()  # Default to empty set
    else:
        keys_to_exclude = set(keys_to_exclude)  # Ensure it's a set for efficient lookup

    # Add the target text key to the exclusion set automatically
    keys_to_exclude.add(text_field_key)

    formatted_pairs = []
    for key, value in item_dict.items():
        # 1. Skip any key in the exclusion set
        if key in keys_to_exclude:
            continue

        # 2. Check for empty/invalid values
        is_empty_or_invalid = False
        if value is None:
            is_empty_or_invalid = True
        elif isinstance(value, float) and math.isnan(value):
            is_empty_or_invalid = True
        elif isinstance(value, (str, list, tuple, dict)) and len(value) == 0:
            is_empty_or_invalid = True

        # 3. Format and add if valid
        if not is_empty_or_invalid:
            formatted_pairs.append(f"{str(key)}: {str(value)}")

    concatenated_text = f"search_document: {pair_separator.join(formatted_pairs)}"
    item_dict[text_field_key] = concatenated_text
    return item_dict

def prepare_df_atlas(df: pd.DataFrame, id_col='id', images_col='images_urls'):
    # Drop completely empty columns
    #df = df.dropna(axis=1, how='all')

    # Fill remaining nulls with empty strings
    #df = df.fillna('')

    # Ensure ID column exists
    if id_col not in df.columns:
        df[id_col] = [f'{i}' for i in range(len(df))]

    # Ensure indexed field exists and is not empty
    #if indexed_col not in df.columns:
    #    df[indexed_col] = ''

    # Keep only the first image URL per item
    #df[images_col] = df[images_col].apply(lambda x: [x[0]] if isinstance(x, list) and len(x) > 1 else x if isinstance(x, list) else [x])
    df[images_col] = df[images_col].apply(lambda x: x[0] if isinstance(x, list) else x)

    # Optional: force all to string (can help with weird dtypes)
    for col in df.columns:
        df[col] = df[col].astype(str)

    return df

def remove_key_value_from_dicts(list_of_dict, key_to_remove):
    """Return copies of the dictionaries with key_to_remove deleted."""
    new_list = []
    for dictionary in list_of_dict:
        new_dict = dictionary.copy()  # Copy to avoid modifying the original list
        if key_to_remove in new_dict:
            del new_dict[key_to_remove]
        new_list.append(new_dict)
    return new_list

def remove_key_value_from_dict(input_dict, key_to_remove='text'):
    """Delete key_to_remove from a single dictionary in place."""
    if not isinstance(input_dict, dict):
        raise TypeError("Input must be a dictionary.")

    if key_to_remove in input_dict:
        del input_dict[key_to_remove]

    return input_dict
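A minimal sketch of the intended flow through these helpers, from a parsed Omeka record to the combined text+image vector the app stores (the record values and URL are made up; both Nomic models emit 768-dim vectors, which is why the app concatenates to 1536, and importing helpers loads both models):

import numpy as np
import helpers  # loads both Nomic models at import time

record = {
    "id": "42",
    "Title": "Carte postale",
    "Description": "Vue du port",
    "images_urls": ["https://example.org/files/original/42.jpg"],  # placeholder URL
}

# Build the 'text' field from every metadata pair except id/images_urls;
# the helper prefixes it with 'search_document:' as the Nomic model expects.
record = helpers.add_concatenated_text_field_exclude_keys(
    record, keys_to_exclude=["id", "images_urls"], text_field_key="text"
)
print(record["text"])  # search_document: Title: Carte postale - Description: Vue du port

text_vec = helpers.generate_text_embed([record["text"]])  # shape (1, 768)
# With a resolvable image URL: img_vec = helpers.generate_img_embed(record["images_urls"])
img_vec = np.zeros((1, 768), dtype=np.float32)            # placeholder for this sketch
combined = np.concatenate([text_vec, img_vec], axis=1)    # (1, 1536), as stored in LanceDB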
lancedb_client.py
ADDED

import numpy as np
import lancedb
import pyarrow as pa
import logging
from dotenv import load_dotenv
import os
import ast

# Load env vars
load_dotenv(os.path.join(os.getcwd(), ".env"), override=True)
# _DEFAULT_PARSE_METADATA is stored as a Python tuple literal, so parse it safely
metadata_keys = list(ast.literal_eval(os.getenv("_DEFAULT_PARSE_METADATA", "()")))

# Setup logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LanceDBManager:

    def __init__(self, db_uri="lancedb", embedding_dim=1536):
        self.db = lancedb.connect(db_uri)
        self.embedding_dim = embedding_dim
        logger.info(f"Connected to LanceDB at {db_uri}")

    def _build_schema(self):
        """Build LanceDB schema with dynamic metadata fields and embedding vector."""
        fields = [
            pa.field("id", pa.int64()),
            pa.field("item_id", pa.string()),
            pa.field("images_urls", pa.string()),
            pa.field("text", pa.string()),
            pa.field("Cluster", pa.string()),
            pa.field("Topic", pa.string()),
            pa.field("embeddings", pa.list_(pa.float32(), self.embedding_dim)),
            pa.field("umap_embeddings", pa.list_(pa.float32(), 2)),
        ]

        # Add fields from metadata
        for key in metadata_keys:
            sanitized_key = key.split(":")[1].strip().capitalize()  # drop the vocabulary prefix and capitalize
            fields.append(pa.field(sanitized_key, pa.string()))

        return pa.schema(fields)

    def create_table(self, table_name):
        """Create table using dynamic schema."""
        try:
            schema = self._build_schema()
            table = self.db.create_table(table_name, schema=schema)
            logger.info(f"Created LanceDB table '{table_name}'")
            return table
        except Exception as e:
            logger.error(f"Failed to create table '{table_name}': {e}")
            raise

    def retrieve_table(self, table_name):
        try:
            table = self.db.open_table(table_name)
            logger.info(f"Opened existing LanceDB table '{table_name}'")
            return table
        except Exception as e:
            logger.error(f"Failed to open table '{table_name}': {e}")
            raise

    def initialize_table(self, table_name):
        try:
            return self.retrieve_table(table_name)
        except Exception:
            logger.info(f"Table '{table_name}' not found. Creating new.")
            return self.create_table(table_name)

    def add_entry(self, table_name, items):
        table = self.initialize_table(table_name)
        table.add(items)
        logger.info(f"Added items to table '{table_name}'")

    def list_tables(self):
        """List all existing tables in the LanceDB instance."""
        try:
            tables = self.db.table_names()
            logger.info("Retrieved list of tables.")
            return tables
        except Exception as e:
            logger.error(f"Failed to list tables: {e}")
            raise

    def get_content_table(self, table_name):
        table = self.initialize_table(table_name)
        return table.to_pandas()

    def drop_table(self, table_name):
        """Remove an existing table by name."""
        try:
            table = self.db.drop_table(table_name)
            logger.info(f"Removed LanceDB table '{table_name}' successfully.")
            return table
        except Exception as e:
            logger.error(f"Failed to remove existing table '{table_name}': {e}")
            raise
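For reference, a minimal sketch of driving LanceDBManager end to end, using only the methods above (the table name and record values are illustrative; vectors must match the declared dimensions, and every schema column should be present in the record):

from lancedb_client import LanceDBManager, metadata_keys

record = {
    "id": 1,
    "item_id": "item-1",
    "images_urls": "https://example.org/img.jpg",
    "text": "search_document: Title: Demo",
    "Cluster": "0",
    "Topic": "demo, test",
    "embeddings": [0.0] * 1536,
    "umap_embeddings": [0.0, 0.0],
}
# Fill the per-metadata string columns the schema declares (Identifier, Type, ...)
for key in metadata_keys:
    record[key.split(":")[1].capitalize()] = ""

manager = LanceDBManager()           # connects to ./lancedb, 1536-dim embeddings
manager.add_entry("demo", [record])  # creates the table on first use
print(manager.list_tables())
df = manager.get_content_table("demo")  # whole table as a pandas DataFrame
manager.drop_table("demo")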
requirements.txt
ADDED

pandas
numpy
dash
dash-bootstrap-components
plotly
datasets
pillow
transformers
sentence-transformers
einops
lancedb
requests
umap-learn
scikit-learn
hdbscan
git+https://github.com/gegedenice/omeka-s-api-client.git
python-dotenv
gunicorn