# /// script # [tool.marimo.runtime] # auto_instantiate = false # /// import marimo __generated_with = "0.13.0" app = marimo.App(width="medium") @app.cell def _(): import marimo as mo import spacy import polars as pl import altair as alt from transformers import AutoTokenizer import math import hashlib # Load spaCy models for English and Japanese nlp_en = spacy.load("en_core_web_md") nlp_ja = spacy.load("ja_core_news_md") # List of tokenizer models llm_model_choices = [ "meta-llama/Llama-4-Scout-17B-16E-Instruct", "google/gemma-3-27b-it", "deepseek-ai/DeepSeek-R1", "mistralai/Mistral-Small-3.1-24B-Instruct-2503", "Qwen/Qwen2.5-72B-Instruct", "google-bert/bert-large-uncased", "openai-community/gpt2", ] return ( AutoTokenizer, alt, hashlib, llm_model_choices, math, mo, nlp_en, nlp_ja, pl, ) @app.cell def _(mo): mo.md("# Tokenization for English and Japanese") return @app.cell def _(mo): # Central state for the text input content get_text_content, set_text_content = mo.state("") return get_text_content, set_text_content @app.cell def _(mo): # Placeholder texts en_placeholder = """ Mrs. Ferrars died on the night of the 16th–17th September—a Thursday. I was sent for at eight o’clock on the morning of Friday the 17th. There was nothing to be done. She had been dead some hours. """.strip() ja_placeholder = """ 吾輩は猫である。名前はまだ無い。 どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。 """.strip() # Create UI element for language selection language_selector = mo.ui.radio( options=["English", "Japanese"], value="English", label="Language" ) # Return selector and placeholders return en_placeholder, ja_placeholder, language_selector @app.cell def _( en_placeholder, get_text_content, ja_placeholder, language_selector, mo, set_text_content, ): # Define text_input dynamically based on language current_placeholder = ( en_placeholder if language_selector.value == "English" else ja_placeholder ) text_input = mo.ui.text_area( # Read value from state value=get_text_content(), label="Enter text", placeholder=current_placeholder, full_width=True, # Update state on user input on_change=lambda v: set_text_content(v), ) return current_placeholder, text_input @app.cell def _(current_placeholder, mo, set_text_content): def apply_placeholder(): set_text_content(current_placeholder) apply_placeholder_button = mo.ui.button( label="Use Placeholder Text", on_click=lambda _: apply_placeholder() ) return (apply_placeholder_button,) @app.cell def _(apply_placeholder_button, language_selector, mo, text_input): mo.vstack( [ text_input, mo.hstack([language_selector, apply_placeholder_button], justify="start"), ] ) return @app.cell def _(get_text_content, language_selector, mo, nlp_en, nlp_ja): # Analyze text using spaCy based on selected language # Read text from state current_text = get_text_content() if language_selector.value == "English": doc = nlp_en(current_text) else: doc = nlp_ja(current_text) # Tokenized version and count tokenized_text = [token.text for token in doc] token_count = len(tokenized_text) mo.md( f"**Tokenized Text:** {' | '.join(tokenized_text)}\n\n**Token Count:** {token_count}" ) return current_text, doc @app.cell def _(doc, mo, pl): # Create a polars DataFrame with token attributes token_data = pl.DataFrame( { "Token": [token.text for token in doc], "Lemma": [token.lemma_ for token in doc], "POS": [token.pos_ for token in doc], "Tag": [token.tag_ for token in doc], "Morph": [ str(token.morph) for token in doc ], # To be more precise, this should be merged back in via .to_dict() "Token Position": list(range(len(doc))), "Sentence Number": [i for i, sent in enumerate(doc.sents) for token in sent], } ) mo.ui.dataframe(token_data, page_size=50) return (token_data,) @app.cell def _(mo): # Create UI element for selecting the column to visualize column_selector = mo.ui.dropdown( options=["POS", "Tag", "Lemma", "Token", "Morph"], value="POS", label="Select column to visualize", ) column_selector return (column_selector,) @app.cell def _(alt, column_selector, mo, token_data): mo.stop(token_data.is_empty(), "Please set input text.") selected_column = column_selector.value # Calculate value counts for the selected column counts_df = ( token_data[selected_column] .value_counts() .sort(by=["count", selected_column], descending=[True, False]) ) chart = ( alt.Chart(counts_df) .mark_bar() .encode( x=alt.X("count", title="Frequency"), y=alt.Y(selected_column, title=selected_column, sort=None), tooltip=[selected_column, "count"], ) .properties(title=f"{selected_column} Distribution") .interactive() ) mo.ui.altair_chart(chart) return @app.cell def _(llm_model_choices, mo): # UI for selecting the LLM tokenizer model llm_tokenizer_selector = mo.ui.dropdown( options=llm_model_choices, value=llm_model_choices[-1], # Default to gpt2 for faster loading initially label="Select LLM Tokenizer Model", ) llm_tokenizer_selector return (llm_tokenizer_selector,) @app.cell def _(AutoTokenizer, llm_tokenizer_selector): # Load the selected tokenizer # Adapted code from: https://huggingface.co/spaces/barttee/tokenizers/blob/main/app.py # This cell will re-run when llm_tokenizer_selector.value changes # Marimo caches the result implicitly based on inputs selected_model_name = llm_tokenizer_selector.value tokenizer = AutoTokenizer.from_pretrained(selected_model_name) return (tokenizer,) @app.cell def _(math): # Function to calculate token statistics def get_token_stats(tokens: list, original_text: str) -> dict: """Calculate enhanced statistics about the tokens.""" if not tokens: return { # Return default structure even for empty input "basic_stats": { "total_tokens": 0, "unique_tokens": 0, "compression_ratio": 0, "space_tokens": 0, "newline_tokens": 0, "special_tokens": 0, "punctuation_tokens": 0, "unique_percentage": 0, }, "length_stats": { "avg_length": 0, "std_dev": 0, "min_length": 0, "max_length": 0, "median_length": 0, }, } total_tokens = len(tokens) unique_tokens = len(set(tokens)) # Handle potential division by zero if total_tokens is 0 (already checked by `if not tokens`) avg_length = ( sum(len(t) for t in tokens) / total_tokens if total_tokens > 0 else 0 ) # Handle potential division by zero if total_tokens is 0 compression_ratio = len(original_text) / total_tokens if total_tokens > 0 else 0 # Token type analysis (Note: Heuristics might vary between tokenizers) # Using startswith(('Ġ', ' ')) covers common space markers like SentencePiece's U+2581 and BPE's 'Ġ' space_tokens = sum(1 for t in tokens if t.startswith(("Ġ", " "))) # Check for common newline representations newline_tokens = sum( 1 for t in tokens if "Ċ" in t or t == "\n" or t == "<0x0A>" ) # A broader definition for special tokens based on common patterns (control tokens) special_tokens = sum( 1 for t in tokens if (t.startswith("<") and t.endswith(">")) or (t.startswith("[") and t.endswith("]")) ) # Simple punctuation check (might overlap with other categories, focuses on single char punct) punctuation_tokens = sum( 1 for t in tokens if len(t) == 1 and not t.isalnum() and t not in [" ", "\n", "Ġ", "Ċ"] ) # Length distribution lengths = [len(t) for t in tokens] if not lengths: # Should not happen if tokens is not empty, but safe check return { "basic_stats": { "total_tokens": 0, "unique_tokens": 0, "compression_ratio": 0, "space_tokens": 0, "newline_tokens": 0, "special_tokens": 0, "punctuation_tokens": 0, "unique_percentage": 0, }, "length_stats": { "avg_length": 0, "std_dev": 0, "min_length": 0, "max_length": 0, "median_length": 0, }, } mean_length = sum(lengths) / len(lengths) variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths) std_dev = math.sqrt(variance) sorted_lengths = sorted(lengths) # Handle case where lengths list might be empty after filtering, though unlikely here median_length = sorted_lengths[len(lengths) // 2] if lengths else 0 return { "basic_stats": { "total_tokens": total_tokens, "unique_tokens": unique_tokens, "compression_ratio": round(compression_ratio, 2), "space_tokens": space_tokens, "newline_tokens": newline_tokens, "special_tokens": special_tokens, "punctuation_tokens": punctuation_tokens, "unique_percentage": round(unique_tokens / total_tokens * 100, 1) if total_tokens > 0 else 0, }, "length_stats": { "avg_length": round(avg_length, 2), "std_dev": round(std_dev, 2), "min_length": min(lengths) if lengths else 0, "max_length": max(lengths) if lengths else 0, "median_length": median_length, }, } return (get_token_stats,) @app.cell def _(hashlib): def get_varied_color(token: str) -> dict: """Generate vibrant colors with HSL for better visual distinction.""" # Use a fixed salt or seed if you want consistent colors across runs for the same token token_hash = hashlib.md5(token.encode()).hexdigest() hue = int(token_hash[:3], 16) % 360 saturation = 70 + (int(token_hash[3:5], 16) % 20) # Saturation between 70-90% lightness = 80 + ( int(token_hash[5:7], 16) % 10 ) # Lightness between 80-90% (light background) # Ensure text color contrasts well with the light background text_lightness = 20 # Dark text for light background return { "background": f"hsl({hue}, {saturation}%, {lightness}%)", "text": f"hsl({hue}, {saturation}%, {text_lightness}%)", } return (get_varied_color,) @app.function def fix_token(token: str) -> str: """Fix token for display with improved space visualization.""" print(token) # Replace SentencePiece space marker U+2581 with a middle dot token = token.replace(" ", "·") # Replace BPE space marker 'Ġ' with a middle dot if token.startswith("Ġ"): space_count = token.count("Ġ") return "·" * space_count + token[space_count:] # Replace newline markers for display token = token.replace( "Ċ", "↵\n" ) # Replace newline marker with symbol and actual newline token = token.replace("<0x0A>", "↵\n") # Handle byte representation of newline return token @app.function def get_tokenizer_info(tokenizer): """ Extract useful information from a tokenizer. Returns a dictionary with tokenizer details. """ print(tokenizer) info = {} try: # Get vocabulary size (dictionary size) if hasattr(tokenizer, "vocab_size"): info["vocab_size"] = tokenizer.vocab_size elif hasattr(tokenizer, "get_vocab"): info["vocab_size"] = len(tokenizer.get_vocab()) # Get model max length if available if ( hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length < 1000000 ): # Sanity check for realistic values info["model_max_length"] = tokenizer.model_max_length else: info["model_max_length"] = "Not specified or very large" # Check tokenizer type info["tokenizer_type"] = tokenizer.__class__.__name__ # Get special tokens using the recommended attributes/methods special_tokens = {} # Prefer all_special_tokens if available if hasattr(tokenizer, "all_special_tokens"): for token in tokenizer.all_special_tokens: # Try to find the attribute name corresponding to the token value token_name = "unknown_special_token" # Default name for attr_name in [ "pad_token", "eos_token", "bos_token", "sep_token", "cls_token", "unk_token", "mask_token", ]: if ( hasattr(tokenizer, attr_name) and getattr(tokenizer, attr_name) == token ): token_name = attr_name break if token and str(token).strip(): special_tokens[token_name] = str(token) else: # Fallback to checking individual attributes for token_name in [ "pad_token", "eos_token", "bos_token", "sep_token", "cls_token", "unk_token", "mask_token", ]: if ( hasattr(tokenizer, token_name) and getattr(tokenizer, token_name) is not None ): token_value = getattr(tokenizer, token_name) if token_value and str(token_value).strip(): special_tokens[token_name] = str(token_value) info["special_tokens"] = special_tokens if special_tokens else "None found" except Exception as e: info["error"] = f"Error extracting tokenizer info: {str(e)}" return info @app.cell def _(mo): show_ids_switch = mo.ui.switch(label="Show Token IDs instead of Text", value=False) return (show_ids_switch,) @app.cell def _( current_text, get_token_stats, get_varied_color, llm_tokenizer_selector, mo, show_ids_switch, tokenizer, ): # --- Tokenization and Data Preparation --- # Get tokenizer metadata tokenizer_info = get_tokenizer_info(tokenizer) # Tokenize the input text # Use tokenize to get string representations for analysis and display all_tokens = tokenizer.tokenize(current_text) print(all_tokens) total_token_count = len(all_tokens) # Limit the number of tokens for display to avoid browser slowdown display_limit = 1000 display_tokens = all_tokens[:display_limit] display_limit_reached = total_token_count > display_limit # Generate data for visualization llm_token_data = [] for idx, token in enumerate(display_tokens): colors = get_varied_color(token) fixed_token_display = fix_token(token) # Apply fixes for display # Handle potential errors during ID conversion (e.g., unknown tokens if not handled by tokenizer) try: token_id = tokenizer.convert_tokens_to_ids(token) except KeyError: token_id = ( tokenizer.unk_token_id if hasattr(tokenizer, "unk_token_id") else -1 ) # Use UNK id or -1 llm_token_data.append( { "original": token, "display": fixed_token_display, "colors": colors, "is_newline": "↵" in fixed_token_display, # Check if it represents a newline "token_id": token_id, "token_index": idx, } ) # Calculate statistics using the full token list token_stats = get_token_stats(all_tokens, current_text) # Construct HTML for colored tokens html_parts = [] for item in llm_token_data: # Use pre-wrap to respect spaces and newlines within the token display style = f"background-color: {item['colors']['background']}; color: {item['colors']['text']}; padding: 1px 3px; margin: 1px; border-radius: 3px; display: inline-block; white-space: pre-wrap; line-height: 1.4;" # Add title attribute for hover info (original token + ID) title = f"Original: {item['original']}\nID: {item['token_id']}" display_content = str(item["token_id"]) if show_ids_switch.value else item["display"] html_parts.append( f'{display_content}' ) token_viz_html = mo.Html(f'