Commit 66347a3 · Val-2 committed · 1 parent: 07c3151

First commit

.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
+ name: Run Python script
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v2
+
+       - name: Set up Python
+         uses: actions/setup-python@v2
+         with:
+           python-version: '3.9'
+
+       - name: Install Gradio
+         run: python -m pip install gradio
+
+       - name: Log in to Hugging Face
+         run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+       - name: Deploy to Spaces
+         run: gradio deploy
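
Note on the login step above: the workflow injects the `hf_token` repository secret straight into an inline `python -c` call before `gradio deploy` runs. A minimal local sketch of the same login, assuming the token is instead exposed through a hypothetical `HF_TOKEN` environment variable, could look like this:

import os
from huggingface_hub import login

# Hypothetical setup: read the token from an environment variable; the workflow
# above instead passes `${{ secrets.hf_token }}` inline on the command line.
login(token=os.environ["HF_TOKEN"])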
.gitignore ADDED
@@ -0,0 +1,216 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
+
+ # Project
+ /pikapikagen/dataset
+ /pikapikagen/training_output
+ /dataset
+ /old_notebooks/dataset
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/DeepLearning.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="jdk" jdkName="Python 3.12 virtualenv at C:\Users\valer\Mega\Programming\DeepLearning\.venv" jdkType="Python SDK" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/deployment.xml ADDED
@@ -0,0 +1,29 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="PublishConfigData" autoUpload="Always" remoteFilesAllowedToDisappearOnAutoupload="false">
+     <serverData>
+       <paths name="root@salad:22 agent">
+         <serverdata>
+           <mappings>
+             <mapping deploy="/tmp/pycharm_project_71" local="$PROJECT_DIR$" />
+           </mappings>
+         </serverdata>
+       </paths>
+       <paths name="val@46.101.132.64:22 key">
+         <serverdata>
+           <mappings>
+             <mapping local="$PROJECT_DIR$" web="/" />
+           </mappings>
+         </serverdata>
+       </paths>
+       <paths name="val@46.101.132.64:22 key (2)">
+         <serverdata>
+           <mappings>
+             <mapping local="$PROJECT_DIR$" web="/" />
+           </mappings>
+         </serverdata>
+       </paths>
+     </serverData>
+     <option name="myAutoUpload" value="ALWAYS" />
+   </component>
+ </project>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,34 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <Languages>
+         <language minSize="49" name="Python" />
+       </Languages>
+     </inspection_tool>
+     <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
+     <inspection_tool class="Mypy" enabled="true" level="TYPO" enabled_by_default="true" editorAttributes="TYPO" />
+     <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <option name="ignoredErrors">
+         <list>
+           <option value="N802" />
+           <option value="N803" />
+           <option value="N806" />
+         </list>
+       </option>
+     </inspection_tool>
+     <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredIdentifiers">
+         <list>
+           <option value="fitz.fitz.Page.MediaBox" />
+           <option value="color_tol" />
+         </list>
+       </option>
+     </inspection_tool>
+     <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
+       <option name="processCode" value="true" />
+       <option name="processLiterals" value="true" />
+       <option name="processComments" value="true" />
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="Black">
+     <option name="sdkName" value="Python 3.12 virtualenv at C:\Users\valer\Mega\Programming\DeepLearning\.venv" />
+   </component>
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 virtualenv at C:\Users\valer\Mega\Programming\DeepLearning\.venv" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/DeepLearning.iml" filepath="$PROJECT_DIR$/.idea/DeepLearning.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
+ </project>
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
PikaPikaTraining.ipynb ADDED
@@ -0,0 +1,112 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PikaPikaGen: Model Training\n",
+ "\n",
+ "This notebook automates the setup and training launch for the PikaPikaGen model.\n",
+ "\n",
+ "The steps performed are:\n",
+ "1. Clone the public GitHub repository.\n",
+ "2. Install the required dependencies via `uv`.\n",
+ "3. Run the training script `main.py`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Installing the required dependencies...\")\n",
+ "\n",
+ "# Make sure uv is installed\n",
+ "%pip install uv\n",
+ "print(\"✅ uv installed successfully.\")\n",
+ "\n",
+ "# Check whether torch is already installed\n",
+ "try:\n",
+ "    import torch\n",
+ "    print(f\"✅ PyTorch already installed (version: {torch.__version__})\")\n",
+ "    torch_installed = True\n",
+ "except ImportError:\n",
+ "    print(\"❌ PyTorch not found, it will be installed\")\n",
+ "    torch_installed = False\n",
+ "\n",
+ "# Main project dependencies\n",
+ "dependencies = [\n",
+ "    \"transformers\",\n",
+ "    \"pandas\",\n",
+ "    \"tqdm\",\n",
+ "    \"matplotlib\",\n",
+ "    \"Pillow\",\n",
+ "    \"requests\",\n",
+ "    \"ipywidgets\"\n",
+ "]\n",
+ "\n",
+ "# Add torch and torchvision only if they are not already installed\n",
+ "if not torch_installed:\n",
+ "    dependencies.extend([\"torch\", \"torchvision\"])\n",
+ "\n",
+ "print(\"Installing dependencies with uv...\")\n",
+ "deps_str = \" \".join(dependencies)\n",
+ "if torch_installed:\n",
+ "    !uv pip install {deps_str}\n",
+ "else:\n",
+ "    !uv pip install {deps_str} --torch-backend=auto\n",
+ "print(\"✅ Main dependencies installed successfully.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "repo_url = \"https://github.com/val-2/DeepLearning\"\n",
+ "branch = \"main\"\n",
+ "repo_name = repo_url.split('/')[-1]\n",
+ "\n",
+ "print(f\"Cloning the repository: {repo_url}\")\n",
+ "\n",
+ "# Check if we're already in the repo directory\n",
+ "current_dir = os.path.basename(os.getcwd())\n",
+ "if current_dir == repo_name:\n",
+ "    print(f\"Already in the repository directory '{repo_name}'. Updating...\")\n",
+ "    !git fetch\n",
+ "    !git pull\n",
+ "    !git checkout {branch}\n",
+ "elif os.path.exists(repo_name):\n",
+ "    print(f\"The directory '{repo_name}' already exists. Updating the repository...\")\n",
+ "    os.chdir(repo_name)\n",
+ "    !git fetch\n",
+ "    !git pull\n",
+ "    !git checkout {branch}\n",
+ "else:\n",
+ "    print(\"Cloning the repository...\")\n",
+ "    !git clone -b {branch} {repo_url}\n",
+ "    os.chdir(repo_name)\n",
+ "\n",
+ "# Move into the repository directory\n",
+ "print(f\"Current working directory: {os.getcwd()}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.12.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
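
The markdown cell in PikaPikaTraining.ipynb lists running `main.py` as step 3, but the cells shown here stop after installing dependencies and cloning the repository. A minimal sketch of that final step, assuming the entry point is `pikapikagen/main.py` (the exact path is not shown in this commit), might be:

import subprocess
import sys

# Assumed location of the training entry point; adjust the path if main.py lives elsewhere.
subprocess.run([sys.executable, "pikapikagen/main.py"], check=True)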
pikapikagen/PikaPikaGen.ipynb ADDED
@@ -0,0 +1,2241 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "raw",
5
+ "metadata": {
6
+ "id": "VDSaH9SVsnNl",
7
+ "vscode": {
8
+ "languageId": "raw"
9
+ }
10
+ },
11
+ "source": [
12
+ "# PikaPikaGen: Text-to-Image Pokemon Sprite Generation with GAN\n"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "# Install required packages\n",
22
+ "#!pip install torch torchvision transformers pandas pillow requests matplotlib tqdm ipywidgets gradio torch-fidelity\n",
23
+ "\n",
24
+ "import torch\n",
25
+ "import torch.nn as nn\n",
26
+ "import torch.optim as optim\n",
27
+ "import torch.nn.functional as F\n",
28
+ "\n",
29
+ "import numpy as np\n",
30
+ "import matplotlib.pyplot as plt\n",
31
+ "import os\n",
32
+ "from tqdm import tqdm\n",
33
+ "from transformers import AutoTokenizer\n",
34
+ "import warnings\n",
35
+ "warnings.filterwarnings('ignore')\n",
36
+ "\n",
37
+ "# Set device\n",
38
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
39
+ "print(f\"Using device: {device}\")\n",
40
+ "\n",
41
+ "# Set random seeds for reproducibility\n",
42
+ "RANDOM_SEED = 42\n",
43
+ "torch.manual_seed(RANDOM_SEED)\n",
44
+ "np.random.seed(RANDOM_SEED)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "raw",
49
+ "metadata": {
50
+ "id": "-rrtsHGqsnNo",
51
+ "vscode": {
52
+ "languageId": "raw"
53
+ }
54
+ },
55
+ "source": [
56
+ "## 1. Data Loading and Preprocessing\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 4,
62
+ "metadata": {
63
+ "id": "aeVuv1YCsnNp"
64
+ },
65
+ "outputs": [],
66
+ "source": [
67
+ "import torch\n",
68
+ "import torchvision.transforms as T\n",
69
+ "\n",
70
+ "\n",
71
+ "class AugmentationPipeline:\n",
72
+ " def __init__(self, p=0.8):\n",
73
+ " self.p = p\n",
74
+ " self.transforms = T.RandomApply([\n",
75
+ " T.RandomHorizontalFlip(p=0.5),\n",
76
+ "\n",
77
+ " T.RandomAffine(degrees=10, translate=(0.05, 0.05), scale=(0.95, 1.05), fill=1),\n",
78
+ "\n",
79
+ " T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),\n",
80
+ "\n",
81
+ " T.RandomErasing(p=0.15, scale=(0.02, 0.1), ratio=(0.3, 3.3), value='random'),\n",
82
+ " ], p=self.p)\n",
83
+ "\n",
84
+ " def apply(self, images):\n",
85
+ " return self.transforms(images)"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "metadata": {
92
+ "colab": {
93
+ "base_uri": "https://localhost:8080/",
94
+ "height": 1000,
95
+ "referenced_widgets": [
96
+ "5efdceae0bac4c978d3a7226247e237f",
97
+ "a39c5c623a3e42448e109fb9ec6bc263",
98
+ "a6ed2ddb1c6f4d1aa945c5a39372f781",
99
+ "8cf950b898e142c1af9b4db92019aa4d",
100
+ "8ed7abd0602c43a1bfc0f96d7611d429",
101
+ "65ba2d78fde14bb2baf5ae1101d7e5ff",
102
+ "4795a78a75dc439a8da7df58bf738940",
103
+ "4545ff199b874d3680a83918513e1d4b",
104
+ "cad8fd90586443778568a1babb8c40e6",
105
+ "57e526d188b9414dabb3b1c895373864",
106
+ "8226a55726c54abba3a48dbfa8e1b6f6",
107
+ "86a3c1a4e9eb4989b23364f21e5df531",
108
+ "5ba39d9d997a45ca848e3e2ffd0e7307",
109
+ "4c22e1b396f342ffb90c1b50a0051862",
110
+ "370e5663868f411697bfb24f4e3efa09",
111
+ "3a338ac4d2944030a07843d8ea24e9fd",
112
+ "128f4312bcdc4166b9e24d8cdd34184d",
113
+ "1b65d6c8540e4f458886d5e7075ab30a",
114
+ "a5a9f8607fdd4f9cad7519eca573f3dc",
115
+ "926149594f94457295c60b4fad9cbac7",
116
+ "7e89bc79516f405e9684eacdce7b4551",
117
+ "c917f3a000fb44338e4afbeabeaab55f"
118
+ ]
119
+ },
120
+ "id": "ppTYW-n5snNp",
121
+ "outputId": "4d7a3003-296a-458c-a339-aeacf5232c91"
122
+ },
123
+ "outputs": [],
124
+ "source": [
125
+ "from data_loader import create_training_setup\n",
126
+ "from utils import denormalize_image\n",
127
+ "\n",
128
+ "tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-mini')\n",
129
+ "\n",
130
+ "# train_augmentation_pipeline = AugmentationPipeline()\n",
131
+ "# Create the complete training setup using the function from pokemon_dataset.py\n",
132
+ "print(\"Creating training setup with train/val split and fixed batches...\")\n",
133
+ "training_setup = create_training_setup(\n",
134
+ " tokenizer=tokenizer,\n",
135
+ " test_set_size=0.2,\n",
136
+ " val_set_size=0.1,\n",
137
+ " batch_size=16,\n",
138
+ " num_workers=0,\n",
139
+ " num_viz_samples=4,\n",
140
+ " random_seed=42,\n",
141
+ " train_augmentation_pipeline=None\n",
142
+ ")\n",
143
+ "\n",
144
+ "# Extract components\n",
145
+ "train_loader = training_setup['train_loader']\n",
146
+ "val_loader = training_setup['val_loader']\n",
147
+ "fixed_train_batch = training_setup['fixed_train_batch']\n",
148
+ "fixed_val_batch = training_setup['fixed_val_batch']\n",
149
+ "fixed_train_attention_batch = training_setup['fixed_train_attention_batch']\n",
150
+ "fixed_val_attention_batch = training_setup['fixed_val_attention_batch']\n",
151
+ "\n",
152
+ "print(\"Training setup complete!\")\n",
153
+ "print(f\"Train loader batches: {len(train_loader)}\")\n",
154
+ "print(f\"Val loader batches: {len(val_loader)}\")\n",
155
+ "\n",
156
+ "# Test the training setup with fixed batches\n",
157
+ "print(\"\\nFixed batch shapes:\")\n",
158
+ "print(f\" Train batch - Images: {fixed_train_batch['image'].shape}\")\n",
159
+ "print(f\" Train batch - Text: {fixed_train_batch['text'].shape}\")\n",
160
+ "print(f\" Train batch - Attention: {fixed_train_batch['attention_mask'].shape}\")\n",
161
+ "print(f\" Val batch - Images: {fixed_val_batch['image'].shape}\")\n",
162
+ "\n",
163
+ "# Display sample images from fixed batches\n",
164
+ "fig, axes = plt.subplots(2, 4, figsize=(16, 8))\n",
165
+ "for i in range(4):\n",
166
+ " # Fixed train batch images\n",
167
+ " train_img = denormalize_image(fixed_train_batch['image'][i])\n",
168
+ " axes[0, i].imshow(train_img.permute(1, 2, 0))\n",
169
+ " axes[0, i].set_title(f\"Train: {fixed_train_batch['pokemon_name'][i]}\")\n",
170
+ " axes[0, i].axis('off')\n",
171
+ "\n",
172
+ " # Fixed val batch images\n",
173
+ " val_img = denormalize_image(fixed_val_batch['image'][i])\n",
174
+ " axes[1, i].imshow(val_img.permute(1, 2, 0))\n",
175
+ " axes[1, i].set_title(f\"Val: {fixed_val_batch['pokemon_name'][i]}\")\n",
176
+ " axes[1, i].axis('off')\n",
177
+ "\n",
178
+ "plt.suptitle(\"Fixed Batches for Training Visualization\", fontsize=16)\n",
179
+ "plt.tight_layout()\n",
180
+ "plt.show()\n",
181
+ "\n",
182
+ "\n",
183
+ "print(\"\\n✅ Dataset and batches loaded successfully from pokemon_dataset.py functionality!\")\n",
184
+ "print(\"Ready for training with proper train/val split and fixed visualization batches.\")\n"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "raw",
189
+ "metadata": {
190
+ "id": "eJSVrf3ysnNq",
191
+ "vscode": {
192
+ "languageId": "raw"
193
+ }
194
+ },
195
+ "source": [
196
+ "## 2. Model Architecture Implementation\n"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": null,
202
+ "metadata": {
203
+ "colab": {
204
+ "base_uri": "https://localhost:8080/",
205
+ "height": 923,
206
+ "referenced_widgets": [
207
+ "bdf500351aea42698c6d6dd5a99021f3",
208
+ "ab61b90c1a5b4a2b9bb5c9d5a215bb3f",
209
+ "dc03fed540b74f3aa4a1b17ebf2c81d3",
210
+ "5837f2c4668646c0a6db2407aebb46e3",
211
+ "edeb423e9ff84e5c8a0d790368d68bba",
212
+ "bf8eb066cdaf4ac096dc14392d085daf",
213
+ "4e32e76c44fb449c8cb767abeb17868a",
214
+ "5c3cb981f324446eae642f7c23a539f0",
215
+ "2fe9614fe5984fa6b887d1e1b3e18b04",
216
+ "64277772cc30408e8ea29f0e268c8880",
217
+ "5b0d55ea20714104818097bd7d1f509a",
218
+ "7e21c6a9c7f44496b6f28513caefb631",
219
+ "439eba0eb4184c0ab83f65fc26bbe388",
220
+ "eee695744ec64aa7b71b9e85968c6f8f",
221
+ "c4ecdc9d982f49129368893c1c0aece9",
222
+ "5f5e7ff6e4c845b99602a4fa00ad550a",
223
+ "304d50e74ad744cdb3a7cc88739cb923",
224
+ "bfcc6d01c9ff4db698afa4318e7c91ac",
225
+ "b2bf751bb96746e4a828241f70e52050",
226
+ "828b227361fe45cd83964149e7475503",
227
+ "58ab975eaba2485cb0945482c26ecf3d",
228
+ "d0b4e43ab5cd4edda6cc061b36bf10a3"
229
+ ]
230
+ },
231
+ "id": "RnNQM3_ysnNr",
232
+ "outputId": "6905696e-05d6-4d97-dd9b-9dc36eea95b7"
233
+ },
234
+ "outputs": [],
235
+ "source": [
236
+ "from model import Generator\n",
237
+ "\n",
238
+ "# Test the generator\n",
239
+ "generator = Generator().to(device)\n",
240
+ "with torch.no_grad():\n",
241
+ " generated_images_256, generated_images_64 = generator(\n",
242
+ " fixed_train_batch['text'][:2].to(device),\n",
243
+ " fixed_train_batch['attention_mask'][:2].to(device)\n",
244
+ " )\n",
245
+ "print(f\"Generator output shape 256x256: {generated_images_256.shape}\")\n",
246
+ "print(f\"Generator output shape 64x64: {generated_images_64.shape}\")\n",
247
+ "\n",
248
+ "print(\"Generator test\")\n",
249
+ "plt.figure(figsize=(12, 8))\n",
250
+ "for i in range(2):\n",
251
+ " # 256x256 images\n",
252
+ " plt.subplot(2, 2, i+1)\n",
253
+ " img_256 = denormalize_image(generated_images_256[i].cpu())\n",
254
+ " plt.imshow(img_256.permute(1, 2, 0))\n",
255
+ " plt.title(f\"Generated 256x256 Sample {i+1}\")\n",
256
+ " plt.axis('off')\n",
257
+ "\n",
258
+ " # 64x64 images\n",
259
+ " plt.subplot(2, 2, i+3)\n",
260
+ " img_64 = denormalize_image(generated_images_64[i].cpu())\n",
261
+ " plt.imshow(img_64.permute(1, 2, 0))\n",
262
+ " plt.title(f\"Generated 64x64 Sample {i+1}\")\n",
263
+ " plt.axis('off')\n",
264
+ "plt.tight_layout()\n",
265
+ "plt.show()\n"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "raw",
270
+ "metadata": {
271
+ "id": "7drCU21JsnNs",
272
+ "vscode": {
273
+ "languageId": "raw"
274
+ }
275
+ },
276
+ "source": [
277
+ "## 3. Training Setup and Utilities\n"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": null,
283
+ "metadata": {
284
+ "colab": {
285
+ "base_uri": "https://localhost:8080/"
286
+ },
287
+ "id": "iQdhzEQQsnNs",
288
+ "outputId": "2dbee275-3b6d-43da-8929-97e21403821f"
289
+ },
290
+ "outputs": [],
291
+ "source": [
292
+ "from discriminators import Discriminator256, Discriminator64\n",
293
+ "from losses import VGGPerceptualLoss, SobelLoss\n",
294
+ "from plots import save_attention_visualization\n",
295
+ "\n",
296
+ "def weights_init(m):\n",
297
+ " \"\"\"Initialize model weights according to the original DCGAN paper\"\"\"\n",
298
+ " classname = m.__class__.__name__\n",
299
+ " if classname.find('Conv') != -1:\n",
300
+ " nn.init.normal_(m.weight.data, 0.0, 0.02)\n",
301
+ " elif classname.find('BatchNorm') != -1:\n",
302
+ " nn.init.normal_(m.weight.data, 1.0, 0.02)\n",
303
+ " nn.init.constant_(m.bias.data, 0)\n",
304
+ "\n",
305
+ "generator = Generator().to(device)\n",
306
+ "discriminator_256 = Discriminator256().to(device)\n",
307
+ "discriminator_64 = Discriminator64().to(device)\n",
308
+ "\n",
309
+ "generator.apply(weights_init)\n",
310
+ "discriminator_256.apply(weights_init)\n",
311
+ "discriminator_64.apply(weights_init)\n",
312
+ "\n",
313
+ "\n",
314
+ "# Optimizer params\n",
315
+ "lr = 0.0002\n",
316
+ "beta1 = 0.5\n",
317
+ "beta2 = 0.999\n",
318
+ "\n",
319
+ "optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(beta1, beta2))\n",
320
+ "optimizer_D_256 = optim.Adam(discriminator_256.parameters(), lr=lr, betas=(beta1, beta2))\n",
321
+ "optimizer_D_64 = optim.Adam(discriminator_64.parameters(), lr=lr, betas=(beta1, beta2))\n",
322
+ "\n",
323
+ "adv_criterion = nn.BCEWithLogitsLoss().to(device) # no sigmoid at the end of discriminators\n",
324
+ "l1_criterion = nn.L1Loss().to(device)\n",
325
+ "perc_criterion = VGGPerceptualLoss(device)\n",
326
+ "sobel_criterion = SobelLoss().to(device)"
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "code",
331
+ "execution_count": null,
332
+ "metadata": {},
333
+ "outputs": [],
334
+ "source": [
335
+ "from typing import TypedDict\n",
336
+ "import torch\n",
337
+ "from plots import save_comparison_grid\n",
338
+ "\n",
339
+ "# Create checkpoint saving directory\n",
340
+ "os.makedirs('models', exist_ok=True)\n",
341
+ "\n",
342
+ "# TypedDicts to pass and return many object at once, without\n",
343
+ "class LossesDict(TypedDict):\n",
344
+ " \"\"\"History of training losses\"\"\"\n",
345
+ " generator: list[float]\n",
346
+ " discriminator: list[float]\n",
347
+ " l1: list[float]\n",
348
+ " perceptual: list[float]\n",
349
+ " sobel: list[float]\n",
350
+ "\n",
351
+ "class ValidationLossesDict(TypedDict):\n",
352
+ " \"\"\"History of validation losses\"\"\"\n",
353
+ " l1: list[float]\n",
354
+ " perceptual: list[float]\n",
355
+ " sobel: list[float]\n",
356
+ " total: list[float]\n",
357
+ "\n",
358
+ "class DiscriminatorComponentsDict(TypedDict):\n",
359
+ " \"\"\"Components of the discriminator loss\"\"\"\n",
360
+ " real_uncond: float\n",
361
+ " real_cond: float\n",
362
+ " real_cond_wrong: float\n",
363
+ " fake_uncond: float\n",
364
+ "\n",
365
+ "class ValidationResultsDict(TypedDict):\n",
366
+ " \"\"\"Single losses for validation\"\"\"\n",
367
+ " l1: float\n",
368
+ " perceptual: float\n",
369
+ " sobel: float\n",
370
+ " total: float\n",
371
+ "\n",
372
+ "# Training history\n",
373
+ "losses: LossesDict = {\n",
374
+ " 'generator': [],\n",
375
+ " 'discriminator': [],\n",
376
+ " 'l1': [],\n",
377
+ " 'perceptual': [],\n",
378
+ " 'sobel': [],\n",
379
+ "}\n",
380
+ "\n",
381
+ "# Validation history\n",
382
+ "val_losses: ValidationLossesDict = {\n",
383
+ " 'l1': [],\n",
384
+ " 'perceptual': [],\n",
385
+ " 'sobel': [],\n",
386
+ " 'total': [],\n",
387
+ "}\n",
388
+ "\n",
389
+ "def validate_model(generator, val_loader, device, l1_criterion, perc_criterion, sobel_criterion) -> ValidationResultsDict:\n",
390
+ " \"\"\"\n",
391
+ " Validate the model on the validation set\n",
392
+ " Returns validation losses\n",
393
+ " \"\"\"\n",
394
+ " generator.eval()\n",
395
+ "\n",
396
+ " val_l1_loss = 0.0\n",
397
+ " val_perc_loss = 0.0\n",
398
+ " val_sobel_loss = 0.0\n",
399
+ " num_batches = 0\n",
400
+ "\n",
401
+ " with torch.no_grad():\n",
402
+ " for batch in val_loader:\n",
403
+ " # Move data to device\n",
404
+ " real_images = batch['image'].to(device)\n",
405
+ " text_ids = batch['text'].to(device)\n",
406
+ " attention_mask = batch['attention_mask'].to(device)\n",
407
+ "\n",
408
+ " # Generate images\n",
409
+ " generated_images, _ = generator(text_ids, attention_mask)\n",
410
+ "\n",
411
+ " # Calculate validation losses (no adversarial loss)\n",
412
+ " batch_l1_loss = l1_criterion(generated_images, real_images)\n",
413
+ " batch_perc_loss = perc_criterion(generated_images, real_images)\n",
414
+ " batch_sobel_loss = sobel_criterion(generated_images, real_images)\n",
415
+ "\n",
416
+ " val_l1_loss += batch_l1_loss.item()\n",
417
+ " val_perc_loss += batch_perc_loss.item()\n",
418
+ " val_sobel_loss += batch_sobel_loss.item()\n",
419
+ " num_batches += 1\n",
420
+ "\n",
421
+ " # Calculate averages\n",
422
+ " avg_val_l1 = val_l1_loss / num_batches\n",
423
+ " avg_val_perc = val_perc_loss / num_batches\n",
424
+ " avg_val_sobel = val_sobel_loss / num_batches\n",
425
+ " avg_val_total = avg_val_l1 + avg_val_perc + avg_val_sobel\n",
426
+ "\n",
427
+ " # Set models back to training mode\n",
428
+ " generator.train()\n",
429
+ "\n",
430
+ " return ValidationResultsDict(\n",
431
+ " l1=avg_val_l1,\n",
432
+ " perceptual=avg_val_perc,\n",
433
+ " sobel=avg_val_sobel,\n",
434
+ " total=avg_val_total\n",
435
+ " )\n",
436
+ "\n",
437
+ "def create_mismatched_text_batch(text_ids, attention_mask):\n",
438
+ " \"\"\"Create a batch with mismatched text for wrong text conditioning\"\"\"\n",
439
+ " batch_size = text_ids.size(0)\n",
440
+ " indices = torch.randperm(batch_size)\n",
441
+ " return text_ids[indices], attention_mask[indices]\n",
442
+ "\n",
443
+ "def compute_discriminator_loss(\n",
444
+ " discriminator,\n",
445
+ " real_images,\n",
446
+ " fake_images,\n",
447
+ " text_ids,\n",
448
+ " attention_mask,\n",
449
+ " wrong_text_ids,\n",
450
+ " wrong_attention_mask,\n",
451
+ " real_labels,\n",
452
+ " fake_labels,\n",
453
+ " adv_criterion\n",
454
+ ") -> tuple[torch.Tensor, DiscriminatorComponentsDict]:\n",
455
+ " \"\"\"Compute discriminator loss with the 4 components.\n",
456
+ " Returns the total loss and the 4 components.\"\"\"\n",
457
+ " # Real images with correct text\n",
458
+ " real_uncond, real_cond = discriminator(real_images, text_ids, attention_mask, return_both=True)\n",
459
+ " real_uncond_loss = adv_criterion(real_uncond, real_labels)\n",
460
+ " real_cond_loss = adv_criterion(real_cond, real_labels)\n",
461
+ "\n",
462
+ " # Real images with wrong text\n",
463
+ " _, real_cond_wrong = discriminator(real_images, wrong_text_ids, wrong_attention_mask, return_both=True)\n",
464
+ " real_cond_wrong_loss = adv_criterion(real_cond_wrong, fake_labels)\n",
465
+ "\n",
466
+ " # Fake images with wrong text\n",
467
+ " fake_uncond, _ = discriminator(fake_images.detach(), wrong_text_ids, wrong_attention_mask, return_both=True)\n",
468
+ " fake_uncond_loss = adv_criterion(fake_uncond, fake_labels)\n",
469
+ "\n",
470
+ " total_loss = (real_uncond_loss + real_cond_loss + real_cond_wrong_loss + fake_uncond_loss) / 4\n",
471
+ "\n",
472
+ " components: DiscriminatorComponentsDict = {\n",
473
+ " 'real_uncond': real_uncond_loss.item(),\n",
474
+ " 'real_cond': real_cond_loss.item(),\n",
475
+ " 'real_cond_wrong': real_cond_wrong_loss.item(),\n",
476
+ " 'fake_uncond': fake_uncond_loss.item(),\n",
477
+ " }\n",
478
+ "\n",
479
+ " return total_loss, components\n",
480
+ "\n",
481
+ "def compute_generator_adversarial_loss(\n",
482
+ " discriminator,\n",
483
+ " fake_images,\n",
484
+ " text_ids,\n",
485
+ " attention_mask,\n",
486
+ " real_labels,\n",
487
+ " adv_criterion\n",
488
+ ") -> torch.Tensor:\n",
489
+ " \"\"\"Compute generator adversarial loss for one discriminator\"\"\"\n",
490
+ " fake_uncond, fake_cond = discriminator(fake_images, text_ids, attention_mask, return_both=True)\n",
491
+ " uncond_loss = adv_criterion(fake_uncond, real_labels)\n",
492
+ " cond_loss = adv_criterion(fake_cond, real_labels)\n",
493
+ " return (uncond_loss + cond_loss) / 2\n",
494
+ "\n",
495
+ "def compute_loss(fake_images, real_images, criterion, lmd):\n",
496
+ " \"\"\"Compute a reconstruction loss only if its lambda > 0\"\"\"\n",
497
+ " return criterion(fake_images, real_images) if lmd > 0 else torch.tensor(0.0, device=device)\n",
498
+ "\n",
499
+ "\n",
500
+ "epoch = 0\n",
501
+ "noise_dim = 100\n"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "raw",
506
+ "metadata": {
507
+ "id": "Oenm8AkasnNt",
508
+ "vscode": {
509
+ "languageId": "raw"
510
+ }
511
+ },
512
+ "source": [
513
+ "## 4. GAN Training Loop"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": null,
519
+ "metadata": {
520
+ "colab": {
521
+ "base_uri": "https://localhost:8080/",
522
+ "height": 1000
523
+ },
524
+ "id": "gmo0Mi6osnNt",
525
+ "outputId": "a4691684-def3-4d29-d00d-20ae40287c8c"
526
+ },
527
+ "outputs": [],
528
+ "source": [
529
+ "from IPython.display import clear_output\n",
530
+ "\n",
531
+ "total_epochs = 150\n",
532
+ "display_interval = 1 # To show generation of training sample\n",
533
+ "save_interval = 15 # To save checkpoint\n",
534
+ "clear_interval = 22 # To clear cell output. If too high or not present, Kaggle page would crash\n",
535
+ "\n",
536
+ "lambda_l1 = 1.0\n",
537
+ "lambda_adv = 1.0\n",
538
+ "lambda_perceptual = 0.0\n",
539
+ "lambda_sobel = 0.0\n",
540
+ "\n",
541
+ "real_label = 1.0\n",
542
+ "fake_label = 0.0\n",
543
+ "\n",
544
+ "print(\"Starting training with dual discriminators...\")\n",
545
+ "\n",
546
+ "for epoch in range(epoch, total_epochs):\n",
547
+ " epoch_g_loss = 0.0\n",
548
+ " epoch_d_loss_64 = 0.0\n",
549
+ " epoch_d_loss_256 = 0.0\n",
550
+ " epoch_l1_loss = 0.0\n",
551
+ " epoch_perc_loss = 0.0\n",
552
+ " epoch_sobel_loss = 0.0\n",
553
+ "\n",
554
+ " # Track discriminator loss components\n",
555
+ " epoch_d256_components: DiscriminatorComponentsDict = {'real_uncond': 0.0, 'real_cond': 0.0, 'real_cond_wrong': 0.0, 'fake_uncond': 0.0}\n",
556
+ " epoch_d64_components: DiscriminatorComponentsDict = {'real_uncond': 0.0, 'real_cond': 0.0, 'real_cond_wrong': 0.0, 'fake_uncond': 0.0}\n",
557
+ "\n",
558
+ " progress_bar = tqdm(train_loader, desc=f\"Epoch {epoch+1}/{total_epochs}\")\n",
559
+ "\n",
560
+ " for i, batch in enumerate(progress_bar):\n",
561
+ " batch_size = batch['image'].size(0)\n",
562
+ "\n",
563
+ " # Move data to device\n",
564
+ " real_images = batch['image'].to(device)\n",
565
+ " text_ids = batch['text'].to(device)\n",
566
+ " attention_mask = batch['attention_mask'].to(device)\n",
567
+ "\n",
568
+ " # Create labels and mismatched text for GAN training\n",
569
+ " real_labels = torch.full((batch_size, 1), real_label, device=device, dtype=torch.float)\n",
570
+ " fake_labels = torch.full((batch_size, 1), fake_label, device=device, dtype=torch.float)\n",
571
+ " wrong_text_ids, wrong_attention_mask = create_mismatched_text_batch(text_ids, attention_mask)\n",
572
+ "\n",
573
+ " # Generate fake images\n",
574
+ " fake_images_256, fake_images_64 = generator(text_ids, attention_mask)\n",
575
+ " real_images_64 = F.interpolate(real_images, size=(64, 64), mode='bilinear', align_corners=False)\n",
576
+ "\n",
577
+ " # Training both discriminators\n",
578
+ " optimizer_D_256.zero_grad()\n",
579
+ " optimizer_D_64.zero_grad()\n",
580
+ "\n",
581
+ " d_loss_256, d256_components = compute_discriminator_loss(\n",
582
+ " discriminator_256, real_images, fake_images_256,\n",
583
+ " text_ids, attention_mask, wrong_text_ids, wrong_attention_mask,\n",
584
+ " real_labels, fake_labels, adv_criterion\n",
585
+ " )\n",
586
+ " d_loss_256.backward()\n",
587
+ "\n",
588
+ " d_loss_64, d64_components = compute_discriminator_loss(\n",
589
+ " discriminator_64, real_images_64, fake_images_64,\n",
590
+ " text_ids, attention_mask, wrong_text_ids, wrong_attention_mask,\n",
591
+ " real_labels, fake_labels, adv_criterion\n",
592
+ " )\n",
593
+ " d_loss_64.backward()\n",
594
+ "\n",
595
+ " optimizer_D_256.step()\n",
596
+ " optimizer_D_64.step()\n",
597
+ "\n",
598
+ " # Training generator\n",
599
+ " optimizer_G.zero_grad()\n",
600
+ "\n",
601
+ " # Adversarial losses for both discriminators\n",
602
+ " g_adv_loss_256 = compute_generator_adversarial_loss(\n",
603
+ " discriminator_256, fake_images_256, text_ids, attention_mask, real_labels, adv_criterion\n",
604
+ " )\n",
605
+ " g_adv_loss_64 = compute_generator_adversarial_loss(\n",
606
+ " discriminator_64, fake_images_64, text_ids, attention_mask, real_labels, adv_criterion\n",
607
+ " )\n",
608
+ " adversarial_loss = (g_adv_loss_256 + g_adv_loss_64) / 2\n",
609
+ "\n",
610
+ " # Compute losses if their lambda is > 0\n",
611
+ " l1_loss = compute_loss(fake_images_256, real_images, l1_criterion, lambda_l1)\n",
612
+ " perc_loss = compute_loss(fake_images_256, real_images, perc_criterion, lambda_perceptual)\n",
613
+ " sobel_loss = compute_loss(fake_images_256, real_images, sobel_criterion, lambda_sobel)\n",
614
+ "\n",
615
+ " # Total generator loss\n",
616
+ " g_loss = (\n",
617
+ " lambda_adv * adversarial_loss +\n",
618
+ " lambda_l1 * l1_loss +\n",
619
+ " lambda_perceptual * perc_loss +\n",
620
+ " lambda_sobel * sobel_loss\n",
621
+ " )\n",
622
+ " g_loss.backward()\n",
623
+ " optimizer_G.step()\n",
624
+ "\n",
625
+ " # Update loss tracking\n",
626
+ " epoch_g_loss += g_loss.item()\n",
627
+ " epoch_d_loss_256 += d_loss_256.item()\n",
628
+ " epoch_d_loss_64 += d_loss_64.item()\n",
629
+ " epoch_l1_loss += l1_loss.item()\n",
630
+ " epoch_perc_loss += perc_loss.item()\n",
631
+ " epoch_sobel_loss += sobel_loss.item()\n",
632
+ "\n",
633
+ " # Update discriminator component tracking\n",
634
+ " for key in epoch_d256_components:\n",
635
+ " epoch_d256_components[key] += d256_components[key]\n",
636
+ " epoch_d64_components[key] += d64_components[key]\n",
637
+ "\n",
638
+ " # Update progress bar with detailed losses and loss components\n",
639
+ " progress_bar.set_postfix({\n",
640
+ " 'G': f'{g_loss.item():.3f}',\n",
641
+ " 'L1': f'{l1_loss.item():.3f}',\n",
642
+ " 'Adv': f'{adversarial_loss.item():.3f}',\n",
643
+ " 'D256': f'{d_loss_256.item():.3f}',\n",
644
+ " 'D256_ru': f'{d256_components[\"real_uncond\"]:.3f}',\n",
645
+ " 'D256_rc': f'{d256_components[\"real_cond\"]:.3f}',\n",
646
+ " 'D256_rcw': f'{d256_components[\"real_cond_wrong\"]:.3f}',\n",
647
+ " 'D256_fu': f'{d256_components[\"fake_uncond\"]:.3f}',\n",
648
+ " 'D64': f'{d_loss_64.item():.3f}',\n",
649
+ " 'D64_ru': f'{d64_components[\"real_uncond\"]:.3f}',\n",
650
+ " 'D64_rc': f'{d64_components[\"real_cond\"]:.3f}',\n",
651
+ " 'D64_rcw': f'{d64_components[\"real_cond_wrong\"]:.3f}',\n",
652
+ " 'D64_fu': f'{d64_components[\"fake_uncond\"]:.3f}',\n",
653
+ " })\n",
654
+ "\n",
655
+ " # Calculate average losses for the epoch\n",
656
+ " avg_g_loss = epoch_g_loss / len(train_loader)\n",
657
+ " avg_d_loss_256 = epoch_d_loss_256 / len(train_loader)\n",
658
+ " avg_d_loss_64 = epoch_d_loss_64 / len(train_loader)\n",
659
+ " avg_l1_loss = epoch_l1_loss / len(train_loader)\n",
660
+ " avg_perc_loss = epoch_perc_loss / len(train_loader)\n",
661
+ " avg_sobel_loss = epoch_sobel_loss / len(train_loader)\n",
662
+ "\n",
663
+ " # Calculate average discriminator components for epoch\n",
664
+ " avg_d256_components = {key: val / len(train_loader) for key, val in epoch_d256_components.items()}\n",
665
+ " avg_d64_components = {key: val / len(train_loader) for key, val in epoch_d64_components.items()}\n",
666
+ "\n",
667
+ " # Store losses (combine discriminator losses)\n",
668
+ " losses['generator'].append(avg_g_loss)\n",
669
+ " losses['discriminator'].append((avg_d_loss_256 + avg_d_loss_64) / 2)\n",
670
+ " losses['l1'].append(avg_l1_loss)\n",
671
+ " losses['perceptual'].append(avg_perc_loss)\n",
672
+ " losses['sobel'].append(avg_sobel_loss)\n",
673
+ "\n",
674
+ " print(f\"Running validation for epoch {epoch+1}...\")\n",
675
+ " validation_results = validate_model(generator, val_loader, device, l1_criterion, perc_criterion, sobel_criterion)\n",
676
+ "\n",
677
+ " # Store validation losses\n",
678
+ "\n",
679
+ " for k, v in validation_results.items():\n",
680
+ " val_losses[k].append(v)\n",
681
+ "\n",
682
+ " if (epoch + 1) % clear_interval == 0:\n",
683
+ " clear_output(wait=True)\n",
684
+ "\n",
685
+ " print(f\"Epoch [{epoch+1}/{total_epochs}]\")\n",
686
+ " print(f\" Train - D_256: {avg_d_loss_256:.4f}, D_64: {avg_d_loss_64:.4f}, G_loss: {avg_g_loss:.4f}\")\n",
687
+ " print(f\" D_256 Components - RU: {avg_d256_components['real_uncond']:.4f}, RC: {avg_d256_components['real_cond']:.4f}, RCW: {avg_d256_components['real_cond_wrong']:.4f}, FU: {avg_d256_components['fake_uncond']:.4f}\")\n",
688
+ " print(f\" D_64 Components - RU: {avg_d64_components['real_uncond']:.4f}, RC: {avg_d64_components['real_cond']:.4f}, RCW: {avg_d64_components['real_cond_wrong']:.4f}, FU: {avg_d64_components['fake_uncond']:.4f}\")\n",
689
+ " print(f\" Train - L1: {avg_l1_loss:.4f}, Perceptual: {avg_perc_loss:.4f}, Sobel: {avg_sobel_loss:.4f}\")\n",
690
+ " print(f\" Val - L1: {validation_results['l1']:.4f}, Perceptual: {validation_results['perceptual']:.4f}, Sobel: {validation_results['sobel']:.4f}, Total: {validation_results['total']:.4f}\")\n",
691
+ " print(\" Legend: RU=RealUncond, RC=RealCond, RCW=RealCondWrong, FU=FakeUncond\")\n",
692
+ "\n",
693
+ " # Display generated images\n",
694
+ " if (epoch + 1) % display_interval == 0:\n",
695
+ " print(f\"\\nGenerating sample images at epoch {epoch+1}:\")\n",
696
+ " print(\"256x256 Training Images:\")\n",
697
+ " save_comparison_grid(epoch+1, generator, fixed_train_batch, \"train_256\", device, show_inline=True)\n",
698
+ " print(\"64x64 Training Images:\")\n",
699
+ " save_comparison_grid(epoch+1, generator, fixed_train_batch, \"train_64\", device, show_inline=True)\n",
700
+ "\n",
701
+ " # Save checkpoint and show visualizations\n",
702
+ " if (epoch + 1) % save_interval == 0:\n",
703
+ " checkpoint_path = f'models/checkpoint_epoch_{epoch+1}.pth'\n",
704
+ " all_losses = {'train': losses, 'val': val_losses}\n",
705
+ " checkpoint = {\n",
706
+ " 'epoch': epoch,\n",
707
+ " 'generator_state_dict': generator.state_dict(),\n",
708
+ " 'discriminator_256_state_dict': discriminator_256.state_dict(),\n",
709
+ " 'discriminator_64_state_dict': discriminator_64.state_dict(),\n",
710
+ " 'g_optimizer_state_dict': optimizer_G.state_dict(),\n",
711
+ " 'd_optimizer_state_dict': optimizer_D_256.state_dict(),\n",
712
+ " 'd_64_optimizer_state_dict': optimizer_D_64.state_dict(),\n",
713
+ " 'losses': all_losses\n",
714
+ " }\n",
715
+ " torch.save(checkpoint, checkpoint_path)\n",
716
+ " print(f\"Checkpoint saved to {checkpoint_path}\")\n",
717
+ "\n",
718
+ " print(\"256x256 Validation Images:\")\n",
719
+ " save_comparison_grid(epoch+1, generator, fixed_val_batch, \"val_256\", device, show_inline=True)\n",
720
+ " print(\"64x64 Validation Images:\")\n",
721
+ " save_comparison_grid(epoch+1, generator, fixed_val_batch, \"val_64\", device, show_inline=True)\n",
722
+ " save_attention_visualization(epoch+1, generator, tokenizer, fixed_train_batch, device, \"train\", show_inline=True)\n",
723
+ " save_attention_visualization(epoch+1, generator, tokenizer, fixed_val_batch, device, \"val\", show_inline=True)\n",
724
+ "\n",
725
+ "print(\"Training completed!\")\n"
726
+ ]
727
+ },
728
+ {
729
+ "cell_type": "raw",
730
+ "metadata": {
731
+ "id": "rbv1Wz4csnNu",
732
+ "vscode": {
733
+ "languageId": "raw"
734
+ }
735
+ },
736
+ "source": [
737
+ "## 5. Training Analysis and Visualization\n"
738
+ ]
739
+ },
740
+ {
741
+ "cell_type": "code",
742
+ "execution_count": null,
743
+ "metadata": {
744
+ "id": "l_90zE2CsnNu"
745
+ },
746
+ "outputs": [],
747
+ "source": [
748
+ "from plots import save_plot_losses, save_plot_non_gan_losses\n",
749
+ "\n",
750
+ "\n",
751
+ "save_plot_losses(\n",
752
+ " losses_g=losses['generator'],\n",
753
+ " losses_d=losses['discriminator'],\n",
754
+ " output_dir=\"training_output\",\n",
755
+ " show_inline=True\n",
756
+ ")\n",
757
+ "\n",
758
+ "# Plot training vs validation losses for non-GAN components (so except \"generator\" and \"discriminator\" from losses)\n",
759
+ "# Convert to list of dicts format expected by save_plot_non_gan_losses\n",
760
+ "train_losses_history = []\n",
761
+ "val_losses_history = []\n",
762
+ "\n",
763
+ "for i in range(len(losses['l1'])):\n",
764
+ " train_losses_history.append({\n",
765
+ " 'l1': losses['l1'][i],\n",
766
+ " 'perceptual': losses['perceptual'][i],\n",
767
+ " 'sobel': losses['sobel'][i],\n",
768
+ " 'total': losses['l1'][i] + losses['perceptual'][i] + losses['sobel'][i]\n",
769
+ " })\n",
770
+ "\n",
771
+ "for i in range(len(val_losses['l1'])):\n",
772
+ " val_losses_history.append({\n",
773
+ " 'l1': val_losses['l1'][i],\n",
774
+ " 'perceptual': val_losses['perceptual'][i],\n",
775
+ " 'sobel': val_losses['sobel'][i],\n",
776
+ " 'total': val_losses['total'][i]\n",
777
+ " })\n",
778
+ "\n",
779
+ "save_plot_non_gan_losses(\n",
780
+ " train_losses_history=train_losses_history,\n",
781
+ " val_losses_history=val_losses_history,\n",
782
+ " output_dir=\"training_output\",\n",
783
+ " show_inline=True\n",
784
+ ")\n",
785
+ "\n",
786
+ "# Print final statistics\n",
787
+ "print(f\"Final Train - Generator Loss: {losses['generator'][-1]:.4f}\")\n",
788
+ "print(f\"Final Train - Discriminator Loss: {losses['discriminator'][-1]:.4f}\")\n",
789
+ "print(f\"Final Train - L1 Loss: {losses['l1'][-1]:.4f}\")\n",
790
+ "print(f\"Final Train - Perceptual Loss: {losses['perceptual'][-1]:.4f}\")\n",
791
+ "print(f\"Final Train - Sobel Loss: {losses['sobel'][-1]:.4f}\")\n",
792
+ "\n",
793
+ "print(f\"Final Val - L1 Loss: {val_losses['l1'][-1]:.4f}\")\n",
794
+ "print(f\"Final Val - Perceptual Loss: {val_losses['perceptual'][-1]:.4f}\")\n",
795
+ "print(f\"Final Val - Sobel Loss: {val_losses['sobel'][-1]:.4f}\")\n",
796
+ "print(f\"Final Val - Total Loss: {val_losses['total'][-1]:.4f}\")\n"
797
+ ]
798
+ },
799
+ {
800
+ "cell_type": "code",
801
+ "execution_count": null,
802
+ "metadata": {
803
+ "id": "Io7I7RTqsnNu"
804
+ },
805
+ "outputs": [],
806
+ "source": [
807
+ "# Generate a grid of final results\n",
808
+ "print(\"Final Results - Generated Pokemon Sprites (256x256):\")\n",
809
+ "batch = next(iter(train_loader))\n",
810
+ "save_comparison_grid(0, generator, batch, \"final_256\", device, show_inline=True)\n",
811
+ "\n",
812
+ "print(\"Final Results - Generated Pokemon Sprites (64x64):\")\n",
813
+ "save_comparison_grid(0, generator, batch, \"final_64\", device, show_inline=True)\n"
814
+ ]
815
+ },
816
+ {
817
+ "cell_type": "raw",
818
+ "metadata": {
819
+ "id": "3a_jxGvCsnNu",
820
+ "vscode": {
821
+ "languageId": "raw"
822
+ }
823
+ },
824
+ "source": [
825
+ "## 7. Model Analysis\n"
826
+ ]
827
+ },
828
+ {
829
+ "cell_type": "code",
830
+ "execution_count": null,
831
+ "metadata": {},
832
+ "outputs": [],
833
+ "source": [
834
+ "def count_parameters(model):\n",
835
+ " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
836
+ "\n",
837
+ "print(f\"Generator parameters: {count_parameters(generator):,}\")\n",
838
+ "print(f\"Discriminator (256) parameters: {count_parameters(discriminator_256):,}\")\n",
839
+ "print(f\"Discriminator (64) parameters: {count_parameters(discriminator_64):,}\")\n"
840
+ ]
841
+ }
842
+ ],
843
+ "metadata": {
844
+ "accelerator": "GPU",
845
+ "colab": {
846
+ "gpuType": "T4",
847
+ "provenance": []
848
+ },
849
+ "kernelspec": {
850
+ "display_name": "Python 3 (ipykernel)",
851
+ "language": "python",
852
+ "name": "python3"
853
+ },
854
+ "language_info": {
855
+ "codemirror_mode": {
856
+ "name": "ipython",
857
+ "version": 3
858
+ },
859
+ "file_extension": ".py",
860
+ "mimetype": "text/x-python",
861
+ "name": "python",
862
+ "nbconvert_exporter": "python",
863
+ "pygments_lexer": "ipython3",
864
+ "version": "3.12.11"
865
+ },
866
+ "widgets": {
867
+ "application/vnd.jupyter.widget-state+json": {
868
+ "128f4312bcdc4166b9e24d8cdd34184d": {
869
+ "model_module": "@jupyter-widgets/base",
870
+ "model_module_version": "1.2.0",
871
+ "model_name": "LayoutModel",
872
+ "state": {
873
+ "_model_module": "@jupyter-widgets/base",
874
+ "_model_module_version": "1.2.0",
875
+ "_model_name": "LayoutModel",
876
+ "_view_count": null,
877
+ "_view_module": "@jupyter-widgets/base",
878
+ "_view_module_version": "1.2.0",
879
+ "_view_name": "LayoutView",
880
+ "align_content": null,
881
+ "align_items": null,
882
+ "align_self": null,
883
+ "border": null,
884
+ "bottom": null,
885
+ "display": null,
886
+ "flex": null,
887
+ "flex_flow": null,
888
+ "grid_area": null,
889
+ "grid_auto_columns": null,
890
+ "grid_auto_flow": null,
891
+ "grid_auto_rows": null,
892
+ "grid_column": null,
893
+ "grid_gap": null,
894
+ "grid_row": null,
895
+ "grid_template_areas": null,
896
+ "grid_template_columns": null,
897
+ "grid_template_rows": null,
898
+ "height": null,
899
+ "justify_content": null,
900
+ "justify_items": null,
901
+ "left": null,
902
+ "margin": null,
903
+ "max_height": null,
904
+ "max_width": null,
905
+ "min_height": null,
906
+ "min_width": null,
907
+ "object_fit": null,
908
+ "object_position": null,
909
+ "order": null,
910
+ "overflow": null,
911
+ "overflow_x": null,
912
+ "overflow_y": null,
913
+ "padding": null,
914
+ "right": null,
915
+ "top": null,
916
+ "visibility": null,
917
+ "width": null
918
+ }
919
+ },
920
+ "1b65d6c8540e4f458886d5e7075ab30a": {
921
+ "model_module": "@jupyter-widgets/controls",
922
+ "model_module_version": "1.5.0",
923
+ "model_name": "DescriptionStyleModel",
924
+ "state": {
925
+ "_model_module": "@jupyter-widgets/controls",
926
+ "_model_module_version": "1.5.0",
927
+ "_model_name": "DescriptionStyleModel",
928
+ "_view_count": null,
929
+ "_view_module": "@jupyter-widgets/base",
930
+ "_view_module_version": "1.2.0",
931
+ "_view_name": "StyleView",
932
+ "description_width": ""
933
+ }
934
+ },
935
+ "2fe9614fe5984fa6b887d1e1b3e18b04": {
936
+ "model_module": "@jupyter-widgets/controls",
937
+ "model_module_version": "1.5.0",
938
+ "model_name": "ProgressStyleModel",
939
+ "state": {
940
+ "_model_module": "@jupyter-widgets/controls",
941
+ "_model_module_version": "1.5.0",
942
+ "_model_name": "ProgressStyleModel",
943
+ "_view_count": null,
944
+ "_view_module": "@jupyter-widgets/base",
945
+ "_view_module_version": "1.2.0",
946
+ "_view_name": "StyleView",
947
+ "bar_color": null,
948
+ "description_width": ""
949
+ }
950
+ },
951
+ "304d50e74ad744cdb3a7cc88739cb923": {
952
+ "model_module": "@jupyter-widgets/base",
953
+ "model_module_version": "1.2.0",
954
+ "model_name": "LayoutModel",
955
+ "state": {
956
+ "_model_module": "@jupyter-widgets/base",
957
+ "_model_module_version": "1.2.0",
958
+ "_model_name": "LayoutModel",
959
+ "_view_count": null,
960
+ "_view_module": "@jupyter-widgets/base",
961
+ "_view_module_version": "1.2.0",
962
+ "_view_name": "LayoutView",
963
+ "align_content": null,
964
+ "align_items": null,
965
+ "align_self": null,
966
+ "border": null,
967
+ "bottom": null,
968
+ "display": null,
969
+ "flex": null,
970
+ "flex_flow": null,
971
+ "grid_area": null,
972
+ "grid_auto_columns": null,
973
+ "grid_auto_flow": null,
974
+ "grid_auto_rows": null,
975
+ "grid_column": null,
976
+ "grid_gap": null,
977
+ "grid_row": null,
978
+ "grid_template_areas": null,
979
+ "grid_template_columns": null,
980
+ "grid_template_rows": null,
981
+ "height": null,
982
+ "justify_content": null,
983
+ "justify_items": null,
984
+ "left": null,
985
+ "margin": null,
986
+ "max_height": null,
987
+ "max_width": null,
988
+ "min_height": null,
989
+ "min_width": null,
990
+ "object_fit": null,
991
+ "object_position": null,
992
+ "order": null,
993
+ "overflow": null,
994
+ "overflow_x": null,
995
+ "overflow_y": null,
996
+ "padding": null,
997
+ "right": null,
998
+ "top": null,
999
+ "visibility": null,
1000
+ "width": null
1001
+ }
1002
+ },
1003
+ "370e5663868f411697bfb24f4e3efa09": {
1004
+ "model_module": "@jupyter-widgets/controls",
1005
+ "model_module_version": "1.5.0",
1006
+ "model_name": "HTMLModel",
1007
+ "state": {
1008
+ "_dom_classes": [],
1009
+ "_model_module": "@jupyter-widgets/controls",
1010
+ "_model_module_version": "1.5.0",
1011
+ "_model_name": "HTMLModel",
1012
+ "_view_count": null,
1013
+ "_view_module": "@jupyter-widgets/controls",
1014
+ "_view_module_version": "1.5.0",
1015
+ "_view_name": "HTMLView",
1016
+ "description": "",
1017
+ "description_tooltip": null,
1018
+ "layout": "IPY_MODEL_7e89bc79516f405e9684eacdce7b4551",
1019
+ "placeholder": "​",
1020
+ "style": "IPY_MODEL_c917f3a000fb44338e4afbeabeaab55f",
1021
+ "value": " 232k/? [00:00&lt;00:00, 12.0MB/s]"
1022
+ }
1023
+ },
1024
+ "3a338ac4d2944030a07843d8ea24e9fd": {
1025
+ "model_module": "@jupyter-widgets/base",
1026
+ "model_module_version": "1.2.0",
1027
+ "model_name": "LayoutModel",
1028
+ "state": {
1029
+ "_model_module": "@jupyter-widgets/base",
1030
+ "_model_module_version": "1.2.0",
1031
+ "_model_name": "LayoutModel",
1032
+ "_view_count": null,
1033
+ "_view_module": "@jupyter-widgets/base",
1034
+ "_view_module_version": "1.2.0",
1035
+ "_view_name": "LayoutView",
1036
+ "align_content": null,
1037
+ "align_items": null,
1038
+ "align_self": null,
1039
+ "border": null,
1040
+ "bottom": null,
1041
+ "display": null,
1042
+ "flex": null,
1043
+ "flex_flow": null,
1044
+ "grid_area": null,
1045
+ "grid_auto_columns": null,
1046
+ "grid_auto_flow": null,
1047
+ "grid_auto_rows": null,
1048
+ "grid_column": null,
1049
+ "grid_gap": null,
1050
+ "grid_row": null,
1051
+ "grid_template_areas": null,
1052
+ "grid_template_columns": null,
1053
+ "grid_template_rows": null,
1054
+ "height": null,
1055
+ "justify_content": null,
1056
+ "justify_items": null,
1057
+ "left": null,
1058
+ "margin": null,
1059
+ "max_height": null,
1060
+ "max_width": null,
1061
+ "min_height": null,
1062
+ "min_width": null,
1063
+ "object_fit": null,
1064
+ "object_position": null,
1065
+ "order": null,
1066
+ "overflow": null,
1067
+ "overflow_x": null,
1068
+ "overflow_y": null,
1069
+ "padding": null,
1070
+ "right": null,
1071
+ "top": null,
1072
+ "visibility": null,
1073
+ "width": null
1074
+ }
1075
+ },
1076
+ "439eba0eb4184c0ab83f65fc26bbe388": {
1077
+ "model_module": "@jupyter-widgets/controls",
1078
+ "model_module_version": "1.5.0",
1079
+ "model_name": "HTMLModel",
1080
+ "state": {
1081
+ "_dom_classes": [],
1082
+ "_model_module": "@jupyter-widgets/controls",
1083
+ "_model_module_version": "1.5.0",
1084
+ "_model_name": "HTMLModel",
1085
+ "_view_count": null,
1086
+ "_view_module": "@jupyter-widgets/controls",
1087
+ "_view_module_version": "1.5.0",
1088
+ "_view_name": "HTMLView",
1089
+ "description": "",
1090
+ "description_tooltip": null,
1091
+ "layout": "IPY_MODEL_304d50e74ad744cdb3a7cc88739cb923",
1092
+ "placeholder": "​",
1093
+ "style": "IPY_MODEL_bfcc6d01c9ff4db698afa4318e7c91ac",
1094
+ "value": "model.safetensors: 100%"
1095
+ }
1096
+ },
1097
+ "4545ff199b874d3680a83918513e1d4b": {
1098
+ "model_module": "@jupyter-widgets/base",
1099
+ "model_module_version": "1.2.0",
1100
+ "model_name": "LayoutModel",
1101
+ "state": {
1102
+ "_model_module": "@jupyter-widgets/base",
1103
+ "_model_module_version": "1.2.0",
1104
+ "_model_name": "LayoutModel",
1105
+ "_view_count": null,
1106
+ "_view_module": "@jupyter-widgets/base",
1107
+ "_view_module_version": "1.2.0",
1108
+ "_view_name": "LayoutView",
1109
+ "align_content": null,
1110
+ "align_items": null,
1111
+ "align_self": null,
1112
+ "border": null,
1113
+ "bottom": null,
1114
+ "display": null,
1115
+ "flex": null,
1116
+ "flex_flow": null,
1117
+ "grid_area": null,
1118
+ "grid_auto_columns": null,
1119
+ "grid_auto_flow": null,
1120
+ "grid_auto_rows": null,
1121
+ "grid_column": null,
1122
+ "grid_gap": null,
1123
+ "grid_row": null,
1124
+ "grid_template_areas": null,
1125
+ "grid_template_columns": null,
1126
+ "grid_template_rows": null,
1127
+ "height": null,
1128
+ "justify_content": null,
1129
+ "justify_items": null,
1130
+ "left": null,
1131
+ "margin": null,
1132
+ "max_height": null,
1133
+ "max_width": null,
1134
+ "min_height": null,
1135
+ "min_width": null,
1136
+ "object_fit": null,
1137
+ "object_position": null,
1138
+ "order": null,
1139
+ "overflow": null,
1140
+ "overflow_x": null,
1141
+ "overflow_y": null,
1142
+ "padding": null,
1143
+ "right": null,
1144
+ "top": null,
1145
+ "visibility": null,
1146
+ "width": null
1147
+ }
1148
+ },
1149
+ "4795a78a75dc439a8da7df58bf738940": {
1150
+ "model_module": "@jupyter-widgets/controls",
1151
+ "model_module_version": "1.5.0",
1152
+ "model_name": "DescriptionStyleModel",
1153
+ "state": {
1154
+ "_model_module": "@jupyter-widgets/controls",
1155
+ "_model_module_version": "1.5.0",
1156
+ "_model_name": "DescriptionStyleModel",
1157
+ "_view_count": null,
1158
+ "_view_module": "@jupyter-widgets/base",
1159
+ "_view_module_version": "1.2.0",
1160
+ "_view_name": "StyleView",
1161
+ "description_width": ""
1162
+ }
1163
+ },
1164
+ "4c22e1b396f342ffb90c1b50a0051862": {
1165
+ "model_module": "@jupyter-widgets/controls",
1166
+ "model_module_version": "1.5.0",
1167
+ "model_name": "FloatProgressModel",
1168
+ "state": {
1169
+ "_dom_classes": [],
1170
+ "_model_module": "@jupyter-widgets/controls",
1171
+ "_model_module_version": "1.5.0",
1172
+ "_model_name": "FloatProgressModel",
1173
+ "_view_count": null,
1174
+ "_view_module": "@jupyter-widgets/controls",
1175
+ "_view_module_version": "1.5.0",
1176
+ "_view_name": "ProgressView",
1177
+ "bar_style": "success",
1178
+ "description": "",
1179
+ "description_tooltip": null,
1180
+ "layout": "IPY_MODEL_a5a9f8607fdd4f9cad7519eca573f3dc",
1181
+ "max": 1,
1182
+ "min": 0,
1183
+ "orientation": "horizontal",
1184
+ "style": "IPY_MODEL_926149594f94457295c60b4fad9cbac7",
1185
+ "value": 1
1186
+ }
1187
+ },
1188
+ "4e32e76c44fb449c8cb767abeb17868a": {
1189
+ "model_module": "@jupyter-widgets/controls",
1190
+ "model_module_version": "1.5.0",
1191
+ "model_name": "DescriptionStyleModel",
1192
+ "state": {
1193
+ "_model_module": "@jupyter-widgets/controls",
1194
+ "_model_module_version": "1.5.0",
1195
+ "_model_name": "DescriptionStyleModel",
1196
+ "_view_count": null,
1197
+ "_view_module": "@jupyter-widgets/base",
1198
+ "_view_module_version": "1.2.0",
1199
+ "_view_name": "StyleView",
1200
+ "description_width": ""
1201
+ }
1202
+ },
1203
+ "57e526d188b9414dabb3b1c895373864": {
1204
+ "model_module": "@jupyter-widgets/base",
1205
+ "model_module_version": "1.2.0",
1206
+ "model_name": "LayoutModel",
1207
+ "state": {
1208
+ "_model_module": "@jupyter-widgets/base",
1209
+ "_model_module_version": "1.2.0",
1210
+ "_model_name": "LayoutModel",
1211
+ "_view_count": null,
1212
+ "_view_module": "@jupyter-widgets/base",
1213
+ "_view_module_version": "1.2.0",
1214
+ "_view_name": "LayoutView",
1215
+ "align_content": null,
1216
+ "align_items": null,
1217
+ "align_self": null,
1218
+ "border": null,
1219
+ "bottom": null,
1220
+ "display": null,
1221
+ "flex": null,
1222
+ "flex_flow": null,
1223
+ "grid_area": null,
1224
+ "grid_auto_columns": null,
1225
+ "grid_auto_flow": null,
1226
+ "grid_auto_rows": null,
1227
+ "grid_column": null,
1228
+ "grid_gap": null,
1229
+ "grid_row": null,
1230
+ "grid_template_areas": null,
1231
+ "grid_template_columns": null,
1232
+ "grid_template_rows": null,
1233
+ "height": null,
1234
+ "justify_content": null,
1235
+ "justify_items": null,
1236
+ "left": null,
1237
+ "margin": null,
1238
+ "max_height": null,
1239
+ "max_width": null,
1240
+ "min_height": null,
1241
+ "min_width": null,
1242
+ "object_fit": null,
1243
+ "object_position": null,
1244
+ "order": null,
1245
+ "overflow": null,
1246
+ "overflow_x": null,
1247
+ "overflow_y": null,
1248
+ "padding": null,
1249
+ "right": null,
1250
+ "top": null,
1251
+ "visibility": null,
1252
+ "width": null
1253
+ }
1254
+ },
1255
+ "5837f2c4668646c0a6db2407aebb46e3": {
1256
+ "model_module": "@jupyter-widgets/controls",
1257
+ "model_module_version": "1.5.0",
1258
+ "model_name": "HTMLModel",
1259
+ "state": {
1260
+ "_dom_classes": [],
1261
+ "_model_module": "@jupyter-widgets/controls",
1262
+ "_model_module_version": "1.5.0",
1263
+ "_model_name": "HTMLModel",
1264
+ "_view_count": null,
1265
+ "_view_module": "@jupyter-widgets/controls",
1266
+ "_view_module_version": "1.5.0",
1267
+ "_view_name": "HTMLView",
1268
+ "description": "",
1269
+ "description_tooltip": null,
1270
+ "layout": "IPY_MODEL_64277772cc30408e8ea29f0e268c8880",
1271
+ "placeholder": "​",
1272
+ "style": "IPY_MODEL_5b0d55ea20714104818097bd7d1f509a",
1273
+ "value": " 45.1M/45.1M [00:00&lt;00:00, 112MB/s]"
1274
+ }
1275
+ },
1276
+ "58ab975eaba2485cb0945482c26ecf3d": {
1277
+ "model_module": "@jupyter-widgets/base",
1278
+ "model_module_version": "1.2.0",
1279
+ "model_name": "LayoutModel",
1280
+ "state": {
1281
+ "_model_module": "@jupyter-widgets/base",
1282
+ "_model_module_version": "1.2.0",
1283
+ "_model_name": "LayoutModel",
1284
+ "_view_count": null,
1285
+ "_view_module": "@jupyter-widgets/base",
1286
+ "_view_module_version": "1.2.0",
1287
+ "_view_name": "LayoutView",
1288
+ "align_content": null,
1289
+ "align_items": null,
1290
+ "align_self": null,
1291
+ "border": null,
1292
+ "bottom": null,
1293
+ "display": null,
1294
+ "flex": null,
1295
+ "flex_flow": null,
1296
+ "grid_area": null,
1297
+ "grid_auto_columns": null,
1298
+ "grid_auto_flow": null,
1299
+ "grid_auto_rows": null,
1300
+ "grid_column": null,
1301
+ "grid_gap": null,
1302
+ "grid_row": null,
1303
+ "grid_template_areas": null,
1304
+ "grid_template_columns": null,
1305
+ "grid_template_rows": null,
1306
+ "height": null,
1307
+ "justify_content": null,
1308
+ "justify_items": null,
1309
+ "left": null,
1310
+ "margin": null,
1311
+ "max_height": null,
1312
+ "max_width": null,
1313
+ "min_height": null,
1314
+ "min_width": null,
1315
+ "object_fit": null,
1316
+ "object_position": null,
1317
+ "order": null,
1318
+ "overflow": null,
1319
+ "overflow_x": null,
1320
+ "overflow_y": null,
1321
+ "padding": null,
1322
+ "right": null,
1323
+ "top": null,
1324
+ "visibility": null,
1325
+ "width": null
1326
+ }
1327
+ },
1328
+ "5b0d55ea20714104818097bd7d1f509a": {
1329
+ "model_module": "@jupyter-widgets/controls",
1330
+ "model_module_version": "1.5.0",
1331
+ "model_name": "DescriptionStyleModel",
1332
+ "state": {
1333
+ "_model_module": "@jupyter-widgets/controls",
1334
+ "_model_module_version": "1.5.0",
1335
+ "_model_name": "DescriptionStyleModel",
1336
+ "_view_count": null,
1337
+ "_view_module": "@jupyter-widgets/base",
1338
+ "_view_module_version": "1.2.0",
1339
+ "_view_name": "StyleView",
1340
+ "description_width": ""
1341
+ }
1342
+ },
1343
+ "5ba39d9d997a45ca848e3e2ffd0e7307": {
1344
+ "model_module": "@jupyter-widgets/controls",
1345
+ "model_module_version": "1.5.0",
1346
+ "model_name": "HTMLModel",
1347
+ "state": {
1348
+ "_dom_classes": [],
1349
+ "_model_module": "@jupyter-widgets/controls",
1350
+ "_model_module_version": "1.5.0",
1351
+ "_model_name": "HTMLModel",
1352
+ "_view_count": null,
1353
+ "_view_module": "@jupyter-widgets/controls",
1354
+ "_view_module_version": "1.5.0",
1355
+ "_view_name": "HTMLView",
1356
+ "description": "",
1357
+ "description_tooltip": null,
1358
+ "layout": "IPY_MODEL_128f4312bcdc4166b9e24d8cdd34184d",
1359
+ "placeholder": "​",
1360
+ "style": "IPY_MODEL_1b65d6c8540e4f458886d5e7075ab30a",
1361
+ "value": "vocab.txt: "
1362
+ }
1363
+ },
1364
+ "5c3cb981f324446eae642f7c23a539f0": {
1365
+ "model_module": "@jupyter-widgets/base",
1366
+ "model_module_version": "1.2.0",
1367
+ "model_name": "LayoutModel",
1368
+ "state": {
1369
+ "_model_module": "@jupyter-widgets/base",
1370
+ "_model_module_version": "1.2.0",
1371
+ "_model_name": "LayoutModel",
1372
+ "_view_count": null,
1373
+ "_view_module": "@jupyter-widgets/base",
1374
+ "_view_module_version": "1.2.0",
1375
+ "_view_name": "LayoutView",
1376
+ "align_content": null,
1377
+ "align_items": null,
1378
+ "align_self": null,
1379
+ "border": null,
1380
+ "bottom": null,
1381
+ "display": null,
1382
+ "flex": null,
1383
+ "flex_flow": null,
1384
+ "grid_area": null,
1385
+ "grid_auto_columns": null,
1386
+ "grid_auto_flow": null,
1387
+ "grid_auto_rows": null,
1388
+ "grid_column": null,
1389
+ "grid_gap": null,
1390
+ "grid_row": null,
1391
+ "grid_template_areas": null,
1392
+ "grid_template_columns": null,
1393
+ "grid_template_rows": null,
1394
+ "height": null,
1395
+ "justify_content": null,
1396
+ "justify_items": null,
1397
+ "left": null,
1398
+ "margin": null,
1399
+ "max_height": null,
1400
+ "max_width": null,
1401
+ "min_height": null,
1402
+ "min_width": null,
1403
+ "object_fit": null,
1404
+ "object_position": null,
1405
+ "order": null,
1406
+ "overflow": null,
1407
+ "overflow_x": null,
1408
+ "overflow_y": null,
1409
+ "padding": null,
1410
+ "right": null,
1411
+ "top": null,
1412
+ "visibility": null,
1413
+ "width": null
1414
+ }
1415
+ },
1416
+ "5efdceae0bac4c978d3a7226247e237f": {
1417
+ "model_module": "@jupyter-widgets/controls",
1418
+ "model_module_version": "1.5.0",
1419
+ "model_name": "HBoxModel",
1420
+ "state": {
1421
+ "_dom_classes": [],
1422
+ "_model_module": "@jupyter-widgets/controls",
1423
+ "_model_module_version": "1.5.0",
1424
+ "_model_name": "HBoxModel",
1425
+ "_view_count": null,
1426
+ "_view_module": "@jupyter-widgets/controls",
1427
+ "_view_module_version": "1.5.0",
1428
+ "_view_name": "HBoxView",
1429
+ "box_style": "",
1430
+ "children": [
1431
+ "IPY_MODEL_a39c5c623a3e42448e109fb9ec6bc263",
1432
+ "IPY_MODEL_a6ed2ddb1c6f4d1aa945c5a39372f781",
1433
+ "IPY_MODEL_8cf950b898e142c1af9b4db92019aa4d"
1434
+ ],
1435
+ "layout": "IPY_MODEL_8ed7abd0602c43a1bfc0f96d7611d429"
1436
+ }
1437
+ },
1438
+ "5f5e7ff6e4c845b99602a4fa00ad550a": {
1439
+ "model_module": "@jupyter-widgets/base",
1440
+ "model_module_version": "1.2.0",
1441
+ "model_name": "LayoutModel",
1442
+ "state": {
1443
+ "_model_module": "@jupyter-widgets/base",
1444
+ "_model_module_version": "1.2.0",
1445
+ "_model_name": "LayoutModel",
1446
+ "_view_count": null,
1447
+ "_view_module": "@jupyter-widgets/base",
1448
+ "_view_module_version": "1.2.0",
1449
+ "_view_name": "LayoutView",
1450
+ "align_content": null,
1451
+ "align_items": null,
1452
+ "align_self": null,
1453
+ "border": null,
1454
+ "bottom": null,
1455
+ "display": null,
1456
+ "flex": null,
1457
+ "flex_flow": null,
1458
+ "grid_area": null,
1459
+ "grid_auto_columns": null,
1460
+ "grid_auto_flow": null,
1461
+ "grid_auto_rows": null,
1462
+ "grid_column": null,
1463
+ "grid_gap": null,
1464
+ "grid_row": null,
1465
+ "grid_template_areas": null,
1466
+ "grid_template_columns": null,
1467
+ "grid_template_rows": null,
1468
+ "height": null,
1469
+ "justify_content": null,
1470
+ "justify_items": null,
1471
+ "left": null,
1472
+ "margin": null,
1473
+ "max_height": null,
1474
+ "max_width": null,
1475
+ "min_height": null,
1476
+ "min_width": null,
1477
+ "object_fit": null,
1478
+ "object_position": null,
1479
+ "order": null,
1480
+ "overflow": null,
1481
+ "overflow_x": null,
1482
+ "overflow_y": null,
1483
+ "padding": null,
1484
+ "right": null,
1485
+ "top": null,
1486
+ "visibility": null,
1487
+ "width": null
1488
+ }
1489
+ },
1490
+ "64277772cc30408e8ea29f0e268c8880": {
1491
+ "model_module": "@jupyter-widgets/base",
1492
+ "model_module_version": "1.2.0",
1493
+ "model_name": "LayoutModel",
1494
+ "state": {
1495
+ "_model_module": "@jupyter-widgets/base",
1496
+ "_model_module_version": "1.2.0",
1497
+ "_model_name": "LayoutModel",
1498
+ "_view_count": null,
1499
+ "_view_module": "@jupyter-widgets/base",
1500
+ "_view_module_version": "1.2.0",
1501
+ "_view_name": "LayoutView",
1502
+ "align_content": null,
1503
+ "align_items": null,
1504
+ "align_self": null,
1505
+ "border": null,
1506
+ "bottom": null,
1507
+ "display": null,
1508
+ "flex": null,
1509
+ "flex_flow": null,
1510
+ "grid_area": null,
1511
+ "grid_auto_columns": null,
1512
+ "grid_auto_flow": null,
1513
+ "grid_auto_rows": null,
1514
+ "grid_column": null,
1515
+ "grid_gap": null,
1516
+ "grid_row": null,
1517
+ "grid_template_areas": null,
1518
+ "grid_template_columns": null,
1519
+ "grid_template_rows": null,
1520
+ "height": null,
1521
+ "justify_content": null,
1522
+ "justify_items": null,
1523
+ "left": null,
1524
+ "margin": null,
1525
+ "max_height": null,
1526
+ "max_width": null,
1527
+ "min_height": null,
1528
+ "min_width": null,
1529
+ "object_fit": null,
1530
+ "object_position": null,
1531
+ "order": null,
1532
+ "overflow": null,
1533
+ "overflow_x": null,
1534
+ "overflow_y": null,
1535
+ "padding": null,
1536
+ "right": null,
1537
+ "top": null,
1538
+ "visibility": null,
1539
+ "width": null
1540
+ }
1541
+ },
1542
+ "65ba2d78fde14bb2baf5ae1101d7e5ff": {
1543
+ "model_module": "@jupyter-widgets/base",
1544
+ "model_module_version": "1.2.0",
1545
+ "model_name": "LayoutModel",
1546
+ "state": {
1547
+ "_model_module": "@jupyter-widgets/base",
1548
+ "_model_module_version": "1.2.0",
1549
+ "_model_name": "LayoutModel",
1550
+ "_view_count": null,
1551
+ "_view_module": "@jupyter-widgets/base",
1552
+ "_view_module_version": "1.2.0",
1553
+ "_view_name": "LayoutView",
1554
+ "align_content": null,
1555
+ "align_items": null,
1556
+ "align_self": null,
1557
+ "border": null,
1558
+ "bottom": null,
1559
+ "display": null,
1560
+ "flex": null,
1561
+ "flex_flow": null,
1562
+ "grid_area": null,
1563
+ "grid_auto_columns": null,
1564
+ "grid_auto_flow": null,
1565
+ "grid_auto_rows": null,
1566
+ "grid_column": null,
1567
+ "grid_gap": null,
1568
+ "grid_row": null,
1569
+ "grid_template_areas": null,
1570
+ "grid_template_columns": null,
1571
+ "grid_template_rows": null,
1572
+ "height": null,
1573
+ "justify_content": null,
1574
+ "justify_items": null,
1575
+ "left": null,
1576
+ "margin": null,
1577
+ "max_height": null,
1578
+ "max_width": null,
1579
+ "min_height": null,
1580
+ "min_width": null,
1581
+ "object_fit": null,
1582
+ "object_position": null,
1583
+ "order": null,
1584
+ "overflow": null,
1585
+ "overflow_x": null,
1586
+ "overflow_y": null,
1587
+ "padding": null,
1588
+ "right": null,
1589
+ "top": null,
1590
+ "visibility": null,
1591
+ "width": null
1592
+ }
1593
+ },
1594
+ "7e21c6a9c7f44496b6f28513caefb631": {
1595
+ "model_module": "@jupyter-widgets/controls",
1596
+ "model_module_version": "1.5.0",
1597
+ "model_name": "HBoxModel",
1598
+ "state": {
1599
+ "_dom_classes": [],
1600
+ "_model_module": "@jupyter-widgets/controls",
1601
+ "_model_module_version": "1.5.0",
1602
+ "_model_name": "HBoxModel",
1603
+ "_view_count": null,
1604
+ "_view_module": "@jupyter-widgets/controls",
1605
+ "_view_module_version": "1.5.0",
1606
+ "_view_name": "HBoxView",
1607
+ "box_style": "",
1608
+ "children": [
1609
+ "IPY_MODEL_439eba0eb4184c0ab83f65fc26bbe388",
1610
+ "IPY_MODEL_eee695744ec64aa7b71b9e85968c6f8f",
1611
+ "IPY_MODEL_c4ecdc9d982f49129368893c1c0aece9"
1612
+ ],
1613
+ "layout": "IPY_MODEL_5f5e7ff6e4c845b99602a4fa00ad550a"
1614
+ }
1615
+ },
1616
+ "7e89bc79516f405e9684eacdce7b4551": {
1617
+ "model_module": "@jupyter-widgets/base",
1618
+ "model_module_version": "1.2.0",
1619
+ "model_name": "LayoutModel",
1620
+ "state": {
1621
+ "_model_module": "@jupyter-widgets/base",
1622
+ "_model_module_version": "1.2.0",
1623
+ "_model_name": "LayoutModel",
1624
+ "_view_count": null,
1625
+ "_view_module": "@jupyter-widgets/base",
1626
+ "_view_module_version": "1.2.0",
1627
+ "_view_name": "LayoutView",
1628
+ "align_content": null,
1629
+ "align_items": null,
1630
+ "align_self": null,
1631
+ "border": null,
1632
+ "bottom": null,
1633
+ "display": null,
1634
+ "flex": null,
1635
+ "flex_flow": null,
1636
+ "grid_area": null,
1637
+ "grid_auto_columns": null,
1638
+ "grid_auto_flow": null,
1639
+ "grid_auto_rows": null,
1640
+ "grid_column": null,
1641
+ "grid_gap": null,
1642
+ "grid_row": null,
1643
+ "grid_template_areas": null,
1644
+ "grid_template_columns": null,
1645
+ "grid_template_rows": null,
1646
+ "height": null,
1647
+ "justify_content": null,
1648
+ "justify_items": null,
1649
+ "left": null,
1650
+ "margin": null,
1651
+ "max_height": null,
1652
+ "max_width": null,
1653
+ "min_height": null,
1654
+ "min_width": null,
1655
+ "object_fit": null,
1656
+ "object_position": null,
1657
+ "order": null,
1658
+ "overflow": null,
1659
+ "overflow_x": null,
1660
+ "overflow_y": null,
1661
+ "padding": null,
1662
+ "right": null,
1663
+ "top": null,
1664
+ "visibility": null,
1665
+ "width": null
1666
+ }
1667
+ },
1668
+ "8226a55726c54abba3a48dbfa8e1b6f6": {
1669
+ "model_module": "@jupyter-widgets/controls",
1670
+ "model_module_version": "1.5.0",
1671
+ "model_name": "DescriptionStyleModel",
1672
+ "state": {
1673
+ "_model_module": "@jupyter-widgets/controls",
1674
+ "_model_module_version": "1.5.0",
1675
+ "_model_name": "DescriptionStyleModel",
1676
+ "_view_count": null,
1677
+ "_view_module": "@jupyter-widgets/base",
1678
+ "_view_module_version": "1.2.0",
1679
+ "_view_name": "StyleView",
1680
+ "description_width": ""
1681
+ }
1682
+ },
1683
+ "828b227361fe45cd83964149e7475503": {
1684
+ "model_module": "@jupyter-widgets/controls",
1685
+ "model_module_version": "1.5.0",
1686
+ "model_name": "ProgressStyleModel",
1687
+ "state": {
1688
+ "_model_module": "@jupyter-widgets/controls",
1689
+ "_model_module_version": "1.5.0",
1690
+ "_model_name": "ProgressStyleModel",
1691
+ "_view_count": null,
1692
+ "_view_module": "@jupyter-widgets/base",
1693
+ "_view_module_version": "1.2.0",
1694
+ "_view_name": "StyleView",
1695
+ "bar_color": null,
1696
+ "description_width": ""
1697
+ }
1698
+ },
1699
+ "86a3c1a4e9eb4989b23364f21e5df531": {
1700
+ "model_module": "@jupyter-widgets/controls",
1701
+ "model_module_version": "1.5.0",
1702
+ "model_name": "HBoxModel",
1703
+ "state": {
1704
+ "_dom_classes": [],
1705
+ "_model_module": "@jupyter-widgets/controls",
1706
+ "_model_module_version": "1.5.0",
1707
+ "_model_name": "HBoxModel",
1708
+ "_view_count": null,
1709
+ "_view_module": "@jupyter-widgets/controls",
1710
+ "_view_module_version": "1.5.0",
1711
+ "_view_name": "HBoxView",
1712
+ "box_style": "",
1713
+ "children": [
1714
+ "IPY_MODEL_5ba39d9d997a45ca848e3e2ffd0e7307",
1715
+ "IPY_MODEL_4c22e1b396f342ffb90c1b50a0051862",
1716
+ "IPY_MODEL_370e5663868f411697bfb24f4e3efa09"
1717
+ ],
1718
+ "layout": "IPY_MODEL_3a338ac4d2944030a07843d8ea24e9fd"
1719
+ }
1720
+ },
1721
+ "8cf950b898e142c1af9b4db92019aa4d": {
1722
+ "model_module": "@jupyter-widgets/controls",
1723
+ "model_module_version": "1.5.0",
1724
+ "model_name": "HTMLModel",
1725
+ "state": {
1726
+ "_dom_classes": [],
1727
+ "_model_module": "@jupyter-widgets/controls",
1728
+ "_model_module_version": "1.5.0",
1729
+ "_model_name": "HTMLModel",
1730
+ "_view_count": null,
1731
+ "_view_module": "@jupyter-widgets/controls",
1732
+ "_view_module_version": "1.5.0",
1733
+ "_view_name": "HTMLView",
1734
+ "description": "",
1735
+ "description_tooltip": null,
1736
+ "layout": "IPY_MODEL_57e526d188b9414dabb3b1c895373864",
1737
+ "placeholder": "​",
1738
+ "style": "IPY_MODEL_8226a55726c54abba3a48dbfa8e1b6f6",
1739
+ "value": " 286/286 [00:00&lt;00:00, 25.2kB/s]"
1740
+ }
1741
+ },
1742
+ "8ed7abd0602c43a1bfc0f96d7611d429": {
1743
+ "model_module": "@jupyter-widgets/base",
1744
+ "model_module_version": "1.2.0",
1745
+ "model_name": "LayoutModel",
1746
+ "state": {
1747
+ "_model_module": "@jupyter-widgets/base",
1748
+ "_model_module_version": "1.2.0",
1749
+ "_model_name": "LayoutModel",
1750
+ "_view_count": null,
1751
+ "_view_module": "@jupyter-widgets/base",
1752
+ "_view_module_version": "1.2.0",
1753
+ "_view_name": "LayoutView",
1754
+ "align_content": null,
1755
+ "align_items": null,
1756
+ "align_self": null,
1757
+ "border": null,
1758
+ "bottom": null,
1759
+ "display": null,
1760
+ "flex": null,
1761
+ "flex_flow": null,
1762
+ "grid_area": null,
1763
+ "grid_auto_columns": null,
1764
+ "grid_auto_flow": null,
1765
+ "grid_auto_rows": null,
1766
+ "grid_column": null,
1767
+ "grid_gap": null,
1768
+ "grid_row": null,
1769
+ "grid_template_areas": null,
1770
+ "grid_template_columns": null,
1771
+ "grid_template_rows": null,
1772
+ "height": null,
1773
+ "justify_content": null,
1774
+ "justify_items": null,
1775
+ "left": null,
1776
+ "margin": null,
1777
+ "max_height": null,
1778
+ "max_width": null,
1779
+ "min_height": null,
1780
+ "min_width": null,
1781
+ "object_fit": null,
1782
+ "object_position": null,
1783
+ "order": null,
1784
+ "overflow": null,
1785
+ "overflow_x": null,
1786
+ "overflow_y": null,
1787
+ "padding": null,
1788
+ "right": null,
1789
+ "top": null,
1790
+ "visibility": null,
1791
+ "width": null
1792
+ }
1793
+ },
1794
+ "926149594f94457295c60b4fad9cbac7": {
1795
+ "model_module": "@jupyter-widgets/controls",
1796
+ "model_module_version": "1.5.0",
1797
+ "model_name": "ProgressStyleModel",
1798
+ "state": {
1799
+ "_model_module": "@jupyter-widgets/controls",
1800
+ "_model_module_version": "1.5.0",
1801
+ "_model_name": "ProgressStyleModel",
1802
+ "_view_count": null,
1803
+ "_view_module": "@jupyter-widgets/base",
1804
+ "_view_module_version": "1.2.0",
1805
+ "_view_name": "StyleView",
1806
+ "bar_color": null,
1807
+ "description_width": ""
1808
+ }
1809
+ },
1810
+ "a39c5c623a3e42448e109fb9ec6bc263": {
1811
+ "model_module": "@jupyter-widgets/controls",
1812
+ "model_module_version": "1.5.0",
1813
+ "model_name": "HTMLModel",
1814
+ "state": {
1815
+ "_dom_classes": [],
1816
+ "_model_module": "@jupyter-widgets/controls",
1817
+ "_model_module_version": "1.5.0",
1818
+ "_model_name": "HTMLModel",
1819
+ "_view_count": null,
1820
+ "_view_module": "@jupyter-widgets/controls",
1821
+ "_view_module_version": "1.5.0",
1822
+ "_view_name": "HTMLView",
1823
+ "description": "",
1824
+ "description_tooltip": null,
1825
+ "layout": "IPY_MODEL_65ba2d78fde14bb2baf5ae1101d7e5ff",
1826
+ "placeholder": "​",
1827
+ "style": "IPY_MODEL_4795a78a75dc439a8da7df58bf738940",
1828
+ "value": "config.json: 100%"
1829
+ }
1830
+ },
1831
+ "a5a9f8607fdd4f9cad7519eca573f3dc": {
1832
+ "model_module": "@jupyter-widgets/base",
1833
+ "model_module_version": "1.2.0",
1834
+ "model_name": "LayoutModel",
1835
+ "state": {
1836
+ "_model_module": "@jupyter-widgets/base",
1837
+ "_model_module_version": "1.2.0",
1838
+ "_model_name": "LayoutModel",
1839
+ "_view_count": null,
1840
+ "_view_module": "@jupyter-widgets/base",
1841
+ "_view_module_version": "1.2.0",
1842
+ "_view_name": "LayoutView",
1843
+ "align_content": null,
1844
+ "align_items": null,
1845
+ "align_self": null,
1846
+ "border": null,
1847
+ "bottom": null,
1848
+ "display": null,
1849
+ "flex": null,
1850
+ "flex_flow": null,
1851
+ "grid_area": null,
1852
+ "grid_auto_columns": null,
1853
+ "grid_auto_flow": null,
1854
+ "grid_auto_rows": null,
1855
+ "grid_column": null,
1856
+ "grid_gap": null,
1857
+ "grid_row": null,
1858
+ "grid_template_areas": null,
1859
+ "grid_template_columns": null,
1860
+ "grid_template_rows": null,
1861
+ "height": null,
1862
+ "justify_content": null,
1863
+ "justify_items": null,
1864
+ "left": null,
1865
+ "margin": null,
1866
+ "max_height": null,
1867
+ "max_width": null,
1868
+ "min_height": null,
1869
+ "min_width": null,
1870
+ "object_fit": null,
1871
+ "object_position": null,
1872
+ "order": null,
1873
+ "overflow": null,
1874
+ "overflow_x": null,
1875
+ "overflow_y": null,
1876
+ "padding": null,
1877
+ "right": null,
1878
+ "top": null,
1879
+ "visibility": null,
1880
+ "width": "20px"
1881
+ }
1882
+ },
1883
+ "a6ed2ddb1c6f4d1aa945c5a39372f781": {
1884
+ "model_module": "@jupyter-widgets/controls",
1885
+ "model_module_version": "1.5.0",
1886
+ "model_name": "FloatProgressModel",
1887
+ "state": {
1888
+ "_dom_classes": [],
1889
+ "_model_module": "@jupyter-widgets/controls",
1890
+ "_model_module_version": "1.5.0",
1891
+ "_model_name": "FloatProgressModel",
1892
+ "_view_count": null,
1893
+ "_view_module": "@jupyter-widgets/controls",
1894
+ "_view_module_version": "1.5.0",
1895
+ "_view_name": "ProgressView",
1896
+ "bar_style": "success",
1897
+ "description": "",
1898
+ "description_tooltip": null,
1899
+ "layout": "IPY_MODEL_4545ff199b874d3680a83918513e1d4b",
1900
+ "max": 286,
1901
+ "min": 0,
1902
+ "orientation": "horizontal",
1903
+ "style": "IPY_MODEL_cad8fd90586443778568a1babb8c40e6",
1904
+ "value": 286
1905
+ }
1906
+ },
1907
+ "ab61b90c1a5b4a2b9bb5c9d5a215bb3f": {
1908
+ "model_module": "@jupyter-widgets/controls",
1909
+ "model_module_version": "1.5.0",
1910
+ "model_name": "HTMLModel",
1911
+ "state": {
1912
+ "_dom_classes": [],
1913
+ "_model_module": "@jupyter-widgets/controls",
1914
+ "_model_module_version": "1.5.0",
1915
+ "_model_name": "HTMLModel",
1916
+ "_view_count": null,
1917
+ "_view_module": "@jupyter-widgets/controls",
1918
+ "_view_module_version": "1.5.0",
1919
+ "_view_name": "HTMLView",
1920
+ "description": "",
1921
+ "description_tooltip": null,
1922
+ "layout": "IPY_MODEL_bf8eb066cdaf4ac096dc14392d085daf",
1923
+ "placeholder": "​",
1924
+ "style": "IPY_MODEL_4e32e76c44fb449c8cb767abeb17868a",
1925
+ "value": "pytorch_model.bin: 100%"
1926
+ }
1927
+ },
1928
+ "b2bf751bb96746e4a828241f70e52050": {
1929
+ "model_module": "@jupyter-widgets/base",
1930
+ "model_module_version": "1.2.0",
1931
+ "model_name": "LayoutModel",
1932
+ "state": {
1933
+ "_model_module": "@jupyter-widgets/base",
1934
+ "_model_module_version": "1.2.0",
1935
+ "_model_name": "LayoutModel",
1936
+ "_view_count": null,
1937
+ "_view_module": "@jupyter-widgets/base",
1938
+ "_view_module_version": "1.2.0",
1939
+ "_view_name": "LayoutView",
1940
+ "align_content": null,
1941
+ "align_items": null,
1942
+ "align_self": null,
1943
+ "border": null,
1944
+ "bottom": null,
1945
+ "display": null,
1946
+ "flex": null,
1947
+ "flex_flow": null,
1948
+ "grid_area": null,
1949
+ "grid_auto_columns": null,
1950
+ "grid_auto_flow": null,
1951
+ "grid_auto_rows": null,
1952
+ "grid_column": null,
1953
+ "grid_gap": null,
1954
+ "grid_row": null,
1955
+ "grid_template_areas": null,
1956
+ "grid_template_columns": null,
1957
+ "grid_template_rows": null,
1958
+ "height": null,
1959
+ "justify_content": null,
1960
+ "justify_items": null,
1961
+ "left": null,
1962
+ "margin": null,
1963
+ "max_height": null,
1964
+ "max_width": null,
1965
+ "min_height": null,
1966
+ "min_width": null,
1967
+ "object_fit": null,
1968
+ "object_position": null,
1969
+ "order": null,
1970
+ "overflow": null,
1971
+ "overflow_x": null,
1972
+ "overflow_y": null,
1973
+ "padding": null,
1974
+ "right": null,
1975
+ "top": null,
1976
+ "visibility": null,
1977
+ "width": null
1978
+ }
1979
+ },
1980
+ "bdf500351aea42698c6d6dd5a99021f3": {
1981
+ "model_module": "@jupyter-widgets/controls",
1982
+ "model_module_version": "1.5.0",
1983
+ "model_name": "HBoxModel",
1984
+ "state": {
1985
+ "_dom_classes": [],
1986
+ "_model_module": "@jupyter-widgets/controls",
1987
+ "_model_module_version": "1.5.0",
1988
+ "_model_name": "HBoxModel",
1989
+ "_view_count": null,
1990
+ "_view_module": "@jupyter-widgets/controls",
1991
+ "_view_module_version": "1.5.0",
1992
+ "_view_name": "HBoxView",
1993
+ "box_style": "",
1994
+ "children": [
1995
+ "IPY_MODEL_ab61b90c1a5b4a2b9bb5c9d5a215bb3f",
1996
+ "IPY_MODEL_dc03fed540b74f3aa4a1b17ebf2c81d3",
1997
+ "IPY_MODEL_5837f2c4668646c0a6db2407aebb46e3"
1998
+ ],
1999
+ "layout": "IPY_MODEL_edeb423e9ff84e5c8a0d790368d68bba"
2000
+ }
2001
+ },
2002
+ "bf8eb066cdaf4ac096dc14392d085daf": {
2003
+ "model_module": "@jupyter-widgets/base",
2004
+ "model_module_version": "1.2.0",
2005
+ "model_name": "LayoutModel",
2006
+ "state": {
2007
+ "_model_module": "@jupyter-widgets/base",
2008
+ "_model_module_version": "1.2.0",
2009
+ "_model_name": "LayoutModel",
2010
+ "_view_count": null,
2011
+ "_view_module": "@jupyter-widgets/base",
2012
+ "_view_module_version": "1.2.0",
2013
+ "_view_name": "LayoutView",
2014
+ "align_content": null,
2015
+ "align_items": null,
2016
+ "align_self": null,
2017
+ "border": null,
2018
+ "bottom": null,
2019
+ "display": null,
2020
+ "flex": null,
2021
+ "flex_flow": null,
2022
+ "grid_area": null,
2023
+ "grid_auto_columns": null,
2024
+ "grid_auto_flow": null,
2025
+ "grid_auto_rows": null,
2026
+ "grid_column": null,
2027
+ "grid_gap": null,
2028
+ "grid_row": null,
2029
+ "grid_template_areas": null,
2030
+ "grid_template_columns": null,
2031
+ "grid_template_rows": null,
2032
+ "height": null,
2033
+ "justify_content": null,
2034
+ "justify_items": null,
2035
+ "left": null,
2036
+ "margin": null,
2037
+ "max_height": null,
2038
+ "max_width": null,
2039
+ "min_height": null,
2040
+ "min_width": null,
2041
+ "object_fit": null,
2042
+ "object_position": null,
2043
+ "order": null,
2044
+ "overflow": null,
2045
+ "overflow_x": null,
2046
+ "overflow_y": null,
2047
+ "padding": null,
2048
+ "right": null,
2049
+ "top": null,
2050
+ "visibility": null,
2051
+ "width": null
2052
+ }
2053
+ },
2054
+ "bfcc6d01c9ff4db698afa4318e7c91ac": {
2055
+ "model_module": "@jupyter-widgets/controls",
2056
+ "model_module_version": "1.5.0",
2057
+ "model_name": "DescriptionStyleModel",
2058
+ "state": {
2059
+ "_model_module": "@jupyter-widgets/controls",
2060
+ "_model_module_version": "1.5.0",
2061
+ "_model_name": "DescriptionStyleModel",
2062
+ "_view_count": null,
2063
+ "_view_module": "@jupyter-widgets/base",
2064
+ "_view_module_version": "1.2.0",
2065
+ "_view_name": "StyleView",
2066
+ "description_width": ""
2067
+ }
2068
+ },
2069
+ "c4ecdc9d982f49129368893c1c0aece9": {
2070
+ "model_module": "@jupyter-widgets/controls",
2071
+ "model_module_version": "1.5.0",
2072
+ "model_name": "HTMLModel",
2073
+ "state": {
2074
+ "_dom_classes": [],
2075
+ "_model_module": "@jupyter-widgets/controls",
2076
+ "_model_module_version": "1.5.0",
2077
+ "_model_name": "HTMLModel",
2078
+ "_view_count": null,
2079
+ "_view_module": "@jupyter-widgets/controls",
2080
+ "_view_module_version": "1.5.0",
2081
+ "_view_name": "HTMLView",
2082
+ "description": "",
2083
+ "description_tooltip": null,
2084
+ "layout": "IPY_MODEL_58ab975eaba2485cb0945482c26ecf3d",
2085
+ "placeholder": "​",
2086
+ "style": "IPY_MODEL_d0b4e43ab5cd4edda6cc061b36bf10a3",
2087
+ "value": " 45.1M/45.1M [00:00&lt;00:00, 89.1MB/s]"
2088
+ }
2089
+ },
2090
+ "c917f3a000fb44338e4afbeabeaab55f": {
2091
+ "model_module": "@jupyter-widgets/controls",
2092
+ "model_module_version": "1.5.0",
2093
+ "model_name": "DescriptionStyleModel",
2094
+ "state": {
2095
+ "_model_module": "@jupyter-widgets/controls",
2096
+ "_model_module_version": "1.5.0",
2097
+ "_model_name": "DescriptionStyleModel",
2098
+ "_view_count": null,
2099
+ "_view_module": "@jupyter-widgets/base",
2100
+ "_view_module_version": "1.2.0",
2101
+ "_view_name": "StyleView",
2102
+ "description_width": ""
2103
+ }
2104
+ },
2105
+ "cad8fd90586443778568a1babb8c40e6": {
2106
+ "model_module": "@jupyter-widgets/controls",
2107
+ "model_module_version": "1.5.0",
2108
+ "model_name": "ProgressStyleModel",
2109
+ "state": {
2110
+ "_model_module": "@jupyter-widgets/controls",
2111
+ "_model_module_version": "1.5.0",
2112
+ "_model_name": "ProgressStyleModel",
2113
+ "_view_count": null,
2114
+ "_view_module": "@jupyter-widgets/base",
2115
+ "_view_module_version": "1.2.0",
2116
+ "_view_name": "StyleView",
2117
+ "bar_color": null,
2118
+ "description_width": ""
2119
+ }
2120
+ },
2121
+ "d0b4e43ab5cd4edda6cc061b36bf10a3": {
2122
+ "model_module": "@jupyter-widgets/controls",
2123
+ "model_module_version": "1.5.0",
2124
+ "model_name": "DescriptionStyleModel",
2125
+ "state": {
2126
+ "_model_module": "@jupyter-widgets/controls",
2127
+ "_model_module_version": "1.5.0",
2128
+ "_model_name": "DescriptionStyleModel",
2129
+ "_view_count": null,
2130
+ "_view_module": "@jupyter-widgets/base",
2131
+ "_view_module_version": "1.2.0",
2132
+ "_view_name": "StyleView",
2133
+ "description_width": ""
2134
+ }
2135
+ },
2136
+ "dc03fed540b74f3aa4a1b17ebf2c81d3": {
2137
+ "model_module": "@jupyter-widgets/controls",
2138
+ "model_module_version": "1.5.0",
2139
+ "model_name": "FloatProgressModel",
2140
+ "state": {
2141
+ "_dom_classes": [],
2142
+ "_model_module": "@jupyter-widgets/controls",
2143
+ "_model_module_version": "1.5.0",
2144
+ "_model_name": "FloatProgressModel",
2145
+ "_view_count": null,
2146
+ "_view_module": "@jupyter-widgets/controls",
2147
+ "_view_module_version": "1.5.0",
2148
+ "_view_name": "ProgressView",
2149
+ "bar_style": "success",
2150
+ "description": "",
2151
+ "description_tooltip": null,
2152
+ "layout": "IPY_MODEL_5c3cb981f324446eae642f7c23a539f0",
2153
+ "max": 45106985,
2154
+ "min": 0,
2155
+ "orientation": "horizontal",
2156
+ "style": "IPY_MODEL_2fe9614fe5984fa6b887d1e1b3e18b04",
2157
+ "value": 45106985
2158
+ }
2159
+ },
2160
+ "edeb423e9ff84e5c8a0d790368d68bba": {
2161
+ "model_module": "@jupyter-widgets/base",
2162
+ "model_module_version": "1.2.0",
2163
+ "model_name": "LayoutModel",
2164
+ "state": {
2165
+ "_model_module": "@jupyter-widgets/base",
2166
+ "_model_module_version": "1.2.0",
2167
+ "_model_name": "LayoutModel",
2168
+ "_view_count": null,
2169
+ "_view_module": "@jupyter-widgets/base",
2170
+ "_view_module_version": "1.2.0",
2171
+ "_view_name": "LayoutView",
2172
+ "align_content": null,
2173
+ "align_items": null,
2174
+ "align_self": null,
2175
+ "border": null,
2176
+ "bottom": null,
2177
+ "display": null,
2178
+ "flex": null,
2179
+ "flex_flow": null,
2180
+ "grid_area": null,
2181
+ "grid_auto_columns": null,
2182
+ "grid_auto_flow": null,
2183
+ "grid_auto_rows": null,
2184
+ "grid_column": null,
2185
+ "grid_gap": null,
2186
+ "grid_row": null,
2187
+ "grid_template_areas": null,
2188
+ "grid_template_columns": null,
2189
+ "grid_template_rows": null,
2190
+ "height": null,
2191
+ "justify_content": null,
2192
+ "justify_items": null,
2193
+ "left": null,
2194
+ "margin": null,
2195
+ "max_height": null,
2196
+ "max_width": null,
2197
+ "min_height": null,
2198
+ "min_width": null,
2199
+ "object_fit": null,
2200
+ "object_position": null,
2201
+ "order": null,
2202
+ "overflow": null,
2203
+ "overflow_x": null,
2204
+ "overflow_y": null,
2205
+ "padding": null,
2206
+ "right": null,
2207
+ "top": null,
2208
+ "visibility": null,
2209
+ "width": null
2210
+ }
2211
+ },
2212
+ "eee695744ec64aa7b71b9e85968c6f8f": {
2213
+ "model_module": "@jupyter-widgets/controls",
2214
+ "model_module_version": "1.5.0",
2215
+ "model_name": "FloatProgressModel",
2216
+ "state": {
2217
+ "_dom_classes": [],
2218
+ "_model_module": "@jupyter-widgets/controls",
2219
+ "_model_module_version": "1.5.0",
2220
+ "_model_name": "FloatProgressModel",
2221
+ "_view_count": null,
2222
+ "_view_module": "@jupyter-widgets/controls",
2223
+ "_view_module_version": "1.5.0",
2224
+ "_view_name": "ProgressView",
2225
+ "bar_style": "success",
2226
+ "description": "",
2227
+ "description_tooltip": null,
2228
+ "layout": "IPY_MODEL_b2bf751bb96746e4a828241f70e52050",
2229
+ "max": 45084768,
2230
+ "min": 0,
2231
+ "orientation": "horizontal",
2232
+ "style": "IPY_MODEL_828b227361fe45cd83964149e7475503",
2233
+ "value": 45084768
2234
+ }
2235
+ }
2236
+ }
2237
+ }
2238
+ },
2239
+ "nbformat": 4,
2240
+ "nbformat_minor": 4
2241
+ }
pikapikagen/README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ---
2
+ title: pikapikagen
3
+ app_file: gradio_demo.py
4
+ sdk: gradio
5
+ sdk_version: 5.35.0
6
+ ---
pikapikagen/__init__.py ADDED
File without changes
pikapikagen/data_loader.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import DataLoader, Subset
2
+ import torch
3
+ from dataset import PokemonDataset
4
+ import math
5
+
6
+ def create_training_setup(
7
+ tokenizer,
8
+ test_set_size,
9
+ val_set_size,
10
+ batch_size,
11
+ num_workers=0,
12
+ num_viz_samples=4,
13
+ random_seed=42,
14
+ train_augmentation_pipeline=None,
15
+ ):
16
+ """
17
+ Create a complete setup for training with dataset, dataloaders and fixed batches for visualization.
18
+ """
19
+ assert 0 <= test_set_size < 1.0, "test_set_size must be a float between 0 and 1"
20
+ assert 0 <= val_set_size < 1.0, "val_set_size must be a float between 0 and 1"
21
+ assert (test_set_size + val_set_size) < 1.0, "The sum of test and validation sizes must be less than 1"
22
+
23
+ train_full_dataset = PokemonDataset(tokenizer=tokenizer, augmentation_transforms=train_augmentation_pipeline)
24
+ # Don't use augmentation for test and validation
25
+ test_val_full_dataset = PokemonDataset(tokenizer=tokenizer)
26
+
27
+ dataset_size = len(train_full_dataset)
28
+
29
+ # Create a random reproducible permutation
30
+ generator = torch.Generator().manual_seed(random_seed)
31
+ shuffled_indices = torch.randperm(dataset_size, generator=generator)
32
+
33
+ val_count = math.ceil(val_set_size * dataset_size)
34
+ test_count = math.ceil(test_set_size * dataset_size)
35
+ train_count = dataset_size - val_count - test_count
36
+
37
+ # Partition based on the computed splits
38
+ train_indices = shuffled_indices[:train_count].tolist()
39
+ test_indices = shuffled_indices[train_count : train_count + test_count].tolist()
40
+ val_indices = shuffled_indices[train_count + test_count :].tolist()
41
+
42
+ # Create the subsets based on the indices
43
+ train_dataset = Subset(train_full_dataset, train_indices)
44
+ test_dataset = Subset(test_val_full_dataset, test_indices)
45
+ val_dataset = Subset(test_val_full_dataset, val_indices)
46
+
47
+ train_loader = DataLoader(
48
+ train_dataset,
49
+ batch_size=batch_size,
50
+ shuffle=True,
51
+ num_workers=num_workers,
52
+ pin_memory=True,
53
+ )
54
+ test_loader = DataLoader(
55
+ test_dataset,
56
+ batch_size=batch_size,
57
+ shuffle=False,
58
+ num_workers=num_workers,
59
+ pin_memory=True,
60
+ )
61
+ val_loader = DataLoader(
62
+ val_dataset,
63
+ batch_size=batch_size,
64
+ shuffle=False,
65
+ num_workers=num_workers,
66
+ pin_memory=True,
67
+ )
68
+
69
+ # Batch for visualization
70
+ vis_generator = torch.Generator().manual_seed(random_seed)
71
+
72
+ fixed_train_batch = next(
73
+ iter(DataLoader(train_dataset, batch_size=num_viz_samples, shuffle=True, generator=vis_generator))
74
+ )
75
+ # Since shuffle is False, no generator is needed
76
+ fixed_test_batch = next(iter(DataLoader(test_dataset, batch_size=num_viz_samples, shuffle=False)))
77
+ fixed_val_batch = next(iter(DataLoader(val_dataset, batch_size=num_viz_samples, shuffle=False)))
78
+
79
+ # Batch (size 1) for attention map visualization
80
+ vis_generator.manual_seed(random_seed)
81
+ fixed_train_attention_batch = next(
82
+ iter(DataLoader(train_dataset, batch_size=1, shuffle=True, generator=vis_generator))
83
+ )
84
+ fixed_test_attention_batch = next(iter(DataLoader(test_dataset, batch_size=1, shuffle=False)))
85
+ fixed_val_attention_batch = next(iter(DataLoader(val_dataset, batch_size=1, shuffle=False)))
86
+
87
+ return {
88
+ 'train_loader': train_loader,
89
+ 'val_loader': val_loader,
90
+ 'test_loader': test_loader,
91
+ 'train_dataset': train_dataset,
92
+ 'val_dataset': val_dataset,
93
+ 'test_dataset': test_dataset,
94
+ 'fixed_train_batch': fixed_train_batch,
95
+ 'fixed_val_batch': fixed_val_batch,
96
+ 'fixed_test_batch': fixed_test_batch,
97
+ 'fixed_train_attention_batch': fixed_train_attention_batch,
98
+ 'fixed_val_attention_batch': fixed_val_attention_batch,
99
+ 'fixed_test_attention_batch': fixed_test_attention_batch,
100
+ }
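Usage sketch for create_training_setup (illustrative only; it assumes the prajjwal1/bert-mini tokenizer used elsewhere in this repo, and the split fractions and batch size below are placeholder values, not taken from the source):

    from transformers import AutoTokenizer
    from data_loader import create_training_setup

    tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")  # same tokenizer as evaluate_kid.py
    setup = create_training_setup(
        tokenizer=tokenizer,
        test_set_size=0.1,   # illustrative split fraction
        val_set_size=0.1,    # illustrative split fraction
        batch_size=16,       # illustrative batch size
        num_workers=2,
    )
    train_loader = setup["train_loader"]      # shuffled training split
    val_loader = setup["val_loader"]          # deterministic, no augmentation
    fixed_batch = setup["fixed_train_batch"]  # reproducible batch for visualization
    print(len(setup["train_dataset"]), len(setup["val_dataset"]), len(setup["test_dataset"]))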
pikapikagen/dataset.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import urllib.request
3
+ import zipfile
4
+ from torch.utils.data import Dataset
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ import torchvision.transforms as transforms
8
+ from PIL import Image
9
+ from typing import TypedDict
10
+ import torch
11
+
12
+
13
+ class PokemonSample(TypedDict):
14
+ text: torch.Tensor # Text already tokenized
15
+ image: torch.Tensor
16
+ description: str # Text before tokenization
17
+ pokemon_name: str
18
+ idx: int
19
+ attention_mask: torch.Tensor
20
+
21
+
22
+ def reporthook(block_num, block_size, total_size):
23
+ if block_num % 16384 == 0:
24
+ print(f"Downloading... {block_num * block_size / (1024 * 1024):.2f} MB")
25
+
26
+
27
+ def download_dataset_if_not_exists():
28
+ dataset_dir = "dataset"
29
+ pokedex_main_dir = os.path.join(dataset_dir, "pokedex-main")
30
+ zip_url = "https://github.com/cristobalmitchell/pokedex/archive/refs/heads/main.zip"
31
+ zip_path = "pokedex_main.zip"
32
+
33
+ if os.path.exists(pokedex_main_dir):
34
+ print(f"{pokedex_main_dir} already exists. Skipping download.")
35
+ return
36
+
37
+ os.makedirs(dataset_dir, exist_ok=True)
38
+
39
+ print("Downloading dataset...")
40
+ urllib.request.urlretrieve(zip_url, zip_path, reporthook)
41
+ print("Download complete.")
42
+
43
+ print("Extracting dataset...")
44
+ with zipfile.ZipFile(zip_path, "r") as zip_ref:
45
+ zip_ref.extractall(dataset_dir)
46
+ print("Extraction complete.")
47
+
48
+ os.remove(zip_path)
49
+
50
+
51
+ class PokemonDataset(Dataset):
52
+ def __init__(
53
+ self,
54
+ tokenizer,
55
+ csv_path="dataset/pokedex-main/data/pokemon.csv",
56
+ image_dir="dataset/pokedex-main/images/small_images",
57
+ max_length=128,
58
+ augmentation_transforms=None,
59
+ ):
60
+ self.df = pd.read_csv(csv_path, encoding="utf-16 LE", delimiter="\t")
61
+ self.image_dir = Path(image_dir)
62
+ print(f"Dataset caricato: {len(self.df)} Pokemon con descrizioni e immagini")
63
+
64
+ self.tokenizer = tokenizer
65
+ self.max_length = max_length
66
+
67
+ if augmentation_transforms is not None:
68
+ self.final_transform = transforms.Compose(
69
+ [
70
+ transforms.ToTensor(),
71
+ transforms.Resize((256, 256), antialias=True),
72
+ augmentation_transforms,
73
+ transforms.Normalize(
74
+ mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
75
+ ), # Normalize to [-1, 1]
76
+ ]
77
+ )
78
+ else:
79
+ self.final_transform = transforms.Compose(
80
+ [
81
+ transforms.ToTensor(),
82
+ transforms.Resize((256, 256), antialias=True),
83
+ transforms.Normalize(
84
+ mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
85
+ ), # Normalize to [-1, 1]
86
+ ]
87
+ )
88
+
89
+ def __len__(self):
90
+ return len(self.df)
91
+
92
+ def __getitem__(self, idx: int) -> PokemonSample:
93
+ # Get the row for this index
94
+ row = self.df.iloc[idx]
95
+
96
+ # === TEXT PREPROCESSING ===
97
+ description = str(row["description"])
98
+
99
+ # Tokenize the text
100
+ encoded = self.tokenizer(
101
+ description,
102
+ max_length=self.max_length,
103
+ padding="max_length",
104
+ truncation=True,
105
+ return_tensors="pt",
106
+ )
107
+
108
+ # Extract token_ids and attention_mask
109
+ text_ids = encoded["input_ids"].squeeze(0) # Remove the batch dimension
110
+ attention_mask = encoded["attention_mask"].squeeze(0)
111
+
112
+ # === IMAGE LOADING AND PREPROCESSING ===
113
+ # Build the image file path
114
+ image_filename = f"{row['national_number']:03d}.png"
115
+ image_path = self.image_dir / image_filename
116
+
117
+ # Load the image
118
+ image_rgba = Image.open(image_path).convert("RGBA")
119
+
120
+ # Handle transparency: composite the image onto a white background
121
+ background = Image.new("RGB", image_rgba.size, (255, 255, 255))
122
+ background.paste(image_rgba, mask=image_rgba.split()[-1])
123
+
124
+ # Apply the final transforms (ToTensor, Resize, Normalize)
125
+ image_tensor = self.final_transform(background)
126
+
127
+ # Build the sample dict (matches pokemon_dataset.py structure)
128
+ sample = {
129
+ "text": text_ids,
130
+ "image": image_tensor,
131
+ "description": description,
132
+ "pokemon_name": row["english_name"],
133
+ "idx": idx,
134
+ "attention_mask": attention_mask,
135
+ }
136
+
137
+ return sample
138
+
139
+
140
+ download_dataset_if_not_exists()
141
+ print("Dataset ready!")
pikapikagen/discriminators.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from model_blocks.text_encoder import TextEncoder
4
+
5
+
6
+ class Discriminator256(nn.Module):
7
+ def __init__(self, text_dim=256, img_channels=3):
8
+ super(Discriminator256, self).__init__()
9
+
10
+ self.text_encoder = TextEncoder() # Separate text encoder for discriminators
11
+
12
+ self.img_path = nn.Sequential(
13
+ # 256x256 -> 128x128
14
+ nn.Conv2d(img_channels, 16, 4, 2, 1, bias=False),
15
+ nn.LeakyReLU(0.2, inplace=True),
16
+
17
+ # 128x128 -> 64x64
18
+ nn.Conv2d(16, 32, 4, 2, 1, bias=False),
19
+ nn.BatchNorm2d(32),
20
+ nn.LeakyReLU(0.2, inplace=True),
21
+
22
+ # 64x64 -> 32x32
23
+ nn.Conv2d(32, 64, 4, 2, 1, bias=False),
24
+ nn.BatchNorm2d(64),
25
+ nn.LeakyReLU(0.2, inplace=True),
26
+
27
+ # 32x32 -> 16x16
28
+ nn.Conv2d(64, 128, 4, 2, 1, bias=False),
29
+ nn.BatchNorm2d(128),
30
+ nn.LeakyReLU(0.2, inplace=True),
31
+
32
+ # 16x16 -> 8x8
33
+ nn.Conv2d(128, 256, 4, 2, 1, bias=False),
34
+ nn.BatchNorm2d(256),
35
+ nn.LeakyReLU(0.2, inplace=True),
36
+
37
+ # 8x8 -> 4x4
38
+ nn.Conv2d(256, 512, 4, 2, 1, bias=False),
39
+ nn.BatchNorm2d(512),
40
+ nn.LeakyReLU(0.2, inplace=True),
41
+ )
42
+
43
+ self.text_path = nn.Sequential(
44
+ nn.Linear(text_dim, 1024),
45
+ nn.LeakyReLU(0.2, inplace=True),
46
+ nn.Linear(1024, 512)
47
+ )
48
+
49
+ # Unconditional classifier (real/fake without text conditioning)
50
+ self.unconditional_classifier = nn.Sequential(
51
+ nn.Linear(512 * 4 * 4, 1024),
52
+ nn.LeakyReLU(0.2, inplace=True),
53
+ nn.Dropout(0.5),
54
+ nn.Linear(1024, 1),
55
+ )
56
+
57
+ # Conditional classifier (text-conditioned real/fake)
58
+ self.conditional_classifier = nn.Sequential(
59
+ nn.Linear(512 * 4 * 4 + 512, 1024), # size: sum of flattened image and text embedding
60
+ nn.LeakyReLU(0.2, inplace=True),
61
+ nn.Dropout(0.5),
62
+ nn.Linear(1024, 1),
63
+ )
64
+
65
+ def forward(self, images, text_features=None, text_mask=None, return_both=True):
66
+ # Encode image
67
+ img_features = self.img_path(images)
68
+ img_features_flat = img_features.view(img_features.size(0), -1) # Flatten
69
+
70
+ unconditional_output = self.unconditional_classifier(img_features_flat)
71
+
72
+ if not return_both:
73
+ return unconditional_output
74
+
75
+ if text_features is None or text_mask is None:
76
+ raise AttributeError("text_features and text_mask necessary for text conditioning")
77
+
78
+ # Encode text (mean pooling)
79
+ global_full_text = self.text_encoder(text_features, text_mask)
80
+ global_text = global_full_text.mean(dim=1)
81
+ text_features_encoded = self.text_path(global_text)
82
+
83
+ # Combine features
84
+ combined = torch.cat([img_features_flat, text_features_encoded], dim=1)
85
+ conditional_output = self.conditional_classifier(combined)
86
+
87
+ return unconditional_output, conditional_output
88
+
89
+
90
+ class Discriminator64(nn.Module):
91
+ def __init__(self, text_dim=256, img_channels=3):
92
+ super(Discriminator64, self).__init__()
93
+
94
+ self.text_encoder = TextEncoder()
95
+
96
+ self.img_path = nn.Sequential(
97
+ # 64x64 -> 32x32
98
+ nn.Conv2d(img_channels, 16, 4, 2, 1, bias=False),
99
+ nn.LeakyReLU(0.2, inplace=True),
100
+
101
+ # 32x32 -> 16x16
102
+ nn.Conv2d(16, 32, 4, 2, 1, bias=False),
103
+ nn.BatchNorm2d(32),
104
+ nn.LeakyReLU(0.2, inplace=True),
105
+
106
+ # 16x16 -> 8x8
107
+ nn.Conv2d(32, 64, 4, 2, 1, bias=False),
108
+ nn.BatchNorm2d(64),
109
+ nn.LeakyReLU(0.2, inplace=True),
110
+
111
+ # 8x8 -> 4x4
112
+ nn.Conv2d(64, 128, 4, 2, 1, bias=False),
113
+ nn.BatchNorm2d(128),
114
+ nn.LeakyReLU(0.2, inplace=True),
115
+ )
116
+
117
+ # Text encoder for discriminator
118
+ self.text_path = nn.Sequential(
119
+ nn.Linear(text_dim, 1024),
120
+ nn.LeakyReLU(0.2, inplace=True),
121
+ nn.Linear(1024, 512)
122
+ )
123
+
124
+ # Unconditional classifier (real/fake without text conditioning)
125
+ self.unconditional_classifier = nn.Sequential(
126
+ nn.Linear(128 * 4 * 4, 1024),
127
+ nn.LeakyReLU(0.2, inplace=True),
128
+ nn.Dropout(0.5),
129
+ nn.Linear(1024, 1),
130
+ )
131
+
132
+ # Conditional classifier (text-conditioned real/fake)
133
+ self.conditional_classifier = nn.Sequential(
134
+ nn.Linear(128 * 4 * 4 + 512, 1024),
135
+ nn.LeakyReLU(0.2, inplace=True),
136
+ nn.Dropout(0.5),
137
+ nn.Linear(1024, 1),
138
+ )
139
+
140
+ def forward(self, images, text_features=None, text_mask=None, return_both=True):
141
+ img_features = self.img_path(images)
142
+ img_features_flat = img_features.view(img_features.size(0), -1) # Flatten
143
+
144
+ unconditional_output = self.unconditional_classifier(img_features_flat)
145
+
146
+ if not return_both:
147
+ return unconditional_output
148
+
149
+ if text_features is None or text_mask is None:
150
+ raise AttributeError("text_features and text_mask necessary for text conditioning")
151
+
152
+
153
+ # Encode text (mean pooling)
154
+ global_full_text = self.text_encoder(text_features, text_mask)
155
+ global_text = global_full_text.mean(dim=1)
156
+ text_features_encoded = self.text_path(global_text)
157
+
158
+ combined = torch.cat([img_features_flat, text_features_encoded], dim=1)
159
+ conditional_output = self.conditional_classifier(combined)
160
+
161
+ return unconditional_output, conditional_output
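A quick shape check for the dual-headed Discriminator64 above (a sketch only: the `from discriminator import Discriminator64` path is a guess, since this file's name is not visible in this excerpt; the token ids and mask are dummy stand-ins for a tokenized description):

import torch
from discriminator import Discriminator64  # hypothetical module path; use this file's actual module name

disc = Discriminator64()
images = torch.randn(2, 3, 64, 64)                 # batch of 64x64 RGB images
token_ids = torch.randint(0, 30000, (2, 128))      # dummy bert-mini token ids
text_mask = torch.ones(2, 128, dtype=torch.long)   # 1 = real token, 0 = padding

# With return_both=True (the default) both heads are returned:
# an unconditional real/fake logit and a text-conditioned one.
uncond, cond = disc(images, text_features=token_ids, text_mask=text_mask)
print(uncond.shape, cond.shape)  # torch.Size([2, 1]) torch.Size([2, 1])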
pikapikagen/evaluate_kid.py ADDED
@@ -0,0 +1,141 @@
1
+ import torch
2
+ from transformers import AutoTokenizer
3
+ from model import Generator as PikaPikaGen
4
+ from data_loader import create_training_setup
5
+ from utils import denormalize_image
6
+ from torch_fidelity import calculate_metrics
7
+ import os
8
+ import tempfile
9
+ from PIL import Image
10
+ import shutil
11
+
12
+ CHECKPOINT_PATH = "pikapikagen/model_checkpoint/checkpoint_epoch_150.pth"
13
+
14
+ TOKENIZER_NAME = "prajjwal1/bert-mini"
15
+
16
+ BATCH_SIZE = 16 # Batch size for generating images
17
+ NUM_WORKERS = 2 # Number of workers for the data loader
18
+
19
+ KID_SUBSET_SIZE = 50
20
+ KID_NUM_SUBSETS = 20
21
+
22
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
23
+
24
+
25
+ class PokemonKIDEvaluator:
26
+ """Evaluator class for computing KID metrics on PikaPikaGen."""
27
+
28
+ def __init__(self, checkpoint_path, device=DEVICE):
29
+ self.device = device
30
+ self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
31
+ self.checkpoint_path = checkpoint_path
32
+
33
+ self._load_model()  # Same checkpoint-loading logic as the Gradio demo
34
+
35
+ def _load_model(self):
36
+ self.generator = PikaPikaGen().to(self.device)
37
+
38
+ checkpoint = torch.load(self.checkpoint_path, map_location=self.device, weights_only=True)
39
+ self.generator.load_state_dict(checkpoint['generator_state_dict'])
40
+ self.generator.eval()
41
+
42
+ @staticmethod
43
+ def _tensor_to_pil(tensor: torch.Tensor) -> Image.Image:
44
+ denormalized = denormalize_image(tensor)
45
+ uint8_tensor = (denormalized * 255).clamp(0, 255).to(torch.uint8)
46
+ img_np = uint8_tensor.cpu().permute(1, 2, 0).numpy()
47
+ return Image.fromarray(img_np)
48
+
49
+ def _save_images_to_temp_dir(self, images_tensor: torch.Tensor, prefix: str) -> str:
50
+ """Save a batch of image tensors to a new temporary directory."""
51
+ temp_dir = tempfile.mkdtemp(prefix=f"pikakid_{prefix}_")
52
+ for i, img_tensor in enumerate(images_tensor):
53
+ pil_img = self._tensor_to_pil(img_tensor)
54
+ img_path = os.path.join(temp_dir, f"{i:06d}.png")
55
+ pil_img.save(img_path)
56
+ return temp_dir
57
+
58
+ def evaluate_kid(self, test_loader, resolution="256x256"):
59
+
60
+ all_real_images = []
61
+ all_generated_images = []
62
+
63
+ with torch.no_grad():
64
+ for batch in test_loader:
65
+ text_ids = batch["text"].to(self.device)
66
+ attention_mask = batch["attention_mask"].to(self.device)
67
+ real_images_256 = batch["image"] # (B, 3, 256, 256)
68
+
69
+ generated_256, generated_64 = self.generator(text_ids, attention_mask)
70
+
71
+ # Select the correct resolution for both real and generated images
72
+ if resolution == "256x256":
73
+ generated_images = generated_256
74
+ processed_real_images = real_images_256
75
+ elif resolution == "64x64":
76
+ generated_images = generated_64
77
+ processed_real_images = torch.nn.functional.interpolate(
78
+ real_images_256, size=(64, 64), mode='bilinear', align_corners=False
79
+ )
80
+ else:
81
+ raise ValueError(f"Unsupported resolution: {resolution}")
82
+
83
+ all_real_images.append(processed_real_images.cpu())
84
+ all_generated_images.append(generated_images.cpu())
85
+
86
+ # Combine all batches into single tensors
87
+ all_real_images = torch.cat(all_real_images, dim=0)
88
+ all_generated_images = torch.cat(all_generated_images, dim=0)
89
+
90
+ # Save images to temporary directories for torch-fidelity
91
+ real_temp_dir = self._save_images_to_temp_dir(all_real_images, "real")
92
+ generated_temp_dir = self._save_images_to_temp_dir(all_generated_images, "generated")
93
+
94
+ metrics = calculate_metrics(
95
+ input1=generated_temp_dir, # Path to generated (fake) images
96
+ input2=real_temp_dir, # Path to real images
97
+ kid=True,
98
+ kid_subset_size=KID_SUBSET_SIZE,
99
+ kid_subsets=KID_NUM_SUBSETS,
100
+ batch_size=BATCH_SIZE,
101
+ device=self.device
102
+ )
103
+
104
+ kid_mean = metrics['kernel_inception_distance_mean']
105
+ kid_std = metrics['kernel_inception_distance_std']
106
+
107
+ # Clean up the temporary directories
108
+ shutil.rmtree(real_temp_dir)
109
+ shutil.rmtree(generated_temp_dir)
110
+
111
+ return kid_mean, kid_std
112
+
113
+ def main():
114
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
115
+ training_setup = create_training_setup(
116
+ tokenizer=tokenizer,
117
+ test_set_size=0.2,
118
+ val_set_size=0.1,
119
+ batch_size=BATCH_SIZE,
120
+ num_workers=NUM_WORKERS,
121
+ random_seed=42, # Use a fixed seed for a reproducible split
122
+ )
123
+ test_loader = training_setup['test_loader']
124
+ test_set_size = len(test_loader.dataset)
125
+
126
+ evaluator = PokemonKIDEvaluator(checkpoint_path=CHECKPOINT_PATH)
127
+
128
+ resolutions_to_test = ['64x64', '256x256']
129
+
130
+ print(f"Checkpoint: {CHECKPOINT_PATH}")
131
+ print(f"Test samples: {test_set_size}")
132
+ print(f"KID Subset Size: {KID_SUBSET_SIZE}")
133
+ print(f"KID Subsets: {KID_NUM_SUBSETS}")
134
+
135
+ for res in resolutions_to_test:
136
+ kid_mean, kid_std = evaluator.evaluate_kid(test_loader, resolution=res)
137
+ print(f"Resolution {res}:\t KID = {kid_mean:.6f} ± {kid_std:.6f}")
138
+
139
+
140
+ if __name__ == "__main__":
141
+ main()
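For reference, the torch-fidelity call used above reduces to the following minimal sketch; the two folder paths are placeholders for directories of PNG files, and kid_subset_size must not exceed the number of images in the smaller folder:

from torch_fidelity import calculate_metrics

metrics = calculate_metrics(
    input1="path/to/generated_pngs",  # placeholder folder of generated images
    input2="path/to/real_pngs",       # placeholder folder of real images
    kid=True,
    kid_subset_size=50,
    kid_subsets=20,
)
print(metrics["kernel_inception_distance_mean"], metrics["kernel_inception_distance_std"])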
pikapikagen/gradio_demo.py ADDED
@@ -0,0 +1,291 @@
1
+ import gradio as gr
2
+ import gradio.themes
3
+ import torch
4
+ import numpy as np
5
+ from PIL import Image
6
+ from transformers import AutoTokenizer
7
+ from model import Generator as PikaPikaGen
8
+ from utils import denormalize_image
9
+ from plots import plot_attention_visualization
10
+ import os
11
+
12
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13
+ CHECKPOINT_PATH = "model_checkpoints/checkpoint_epoch_150.pth"
14
+ TOKENIZER_NAME = "prajjwal1/bert-mini"
15
+
16
+
17
+ class PokemonGenerator:
18
+ """Main class for the Pokemon generation demo"""
19
+
20
+ def __init__(self):
21
+ self.device = DEVICE
22
+ self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
23
+
24
+ self._load_model()
25
+
26
+
27
+ def _load_model(self):
28
+ """Load the trained PikaPikaGen model"""
29
+ try:
30
+ # Initialize model
31
+ self.generator = PikaPikaGen().to(self.device)
32
+
33
+ # Load checkpoint
34
+ checkpoint = torch.load(CHECKPOINT_PATH, map_location=self.device, weights_only=True)
35
+
36
+ # Load saved weights into model
37
+ self.generator.load_state_dict(checkpoint['generator_state_dict'])
38
+ print(f"✅ Generator loaded from checkpoint (epoch {checkpoint.get('epoch', 'unknown')})")
39
+
40
+ # No training
41
+ self.generator.eval()
42
+
43
+ except Exception as e:
44
+ print(f"❌ Error loading model: {e}")
45
+ raise
46
+
47
+ def _tensor_to_pil(self, tensor):
48
+ """Convert tensor to PIL Image"""
49
+ # tensor shape: (3, H, W)
50
+ img_np = tensor.permute(1, 2, 0).clamp(0, 1).numpy()
51
+ img_np = (img_np * 255).astype(np.uint8)
52
+ return Image.fromarray(img_np)
53
+
54
+ def generate_pokemon(self, description, num_samples=4, show_attention=False, resolution="both"):
55
+ """
56
+ Generate Pokemon sprites from text description
57
+
58
+ Args:
59
+ description (str): Text description of the desired Pokemon
60
+ num_samples (int): Number of samples to generate (1-8)
61
+ show_attention (bool): Whether to show attention visualization
62
+ resolution (str): Output resolution - "256x256", "64x64", or "both"
63
+
64
+ Returns:
65
+ tuple: (generated_images, attention_plot)
66
+ """
67
+ if not description.strip():
68
+ return [], "❌ Please enter a description."
69
+
70
+ # No reason to compute gradients
71
+ with torch.no_grad():
72
+ tokens = self.tokenizer(
73
+ description,
74
+ max_length=128,
75
+ padding='max_length',
76
+ truncation=True,
77
+ return_tensors='pt'
78
+ )
79
+
80
+ text_ids = tokens['input_ids'].repeat(num_samples, 1).to(self.device)
81
+ attention_mask = tokens['attention_mask'].repeat(num_samples, 1).to(self.device)
82
+
83
+ generated_256, generated_64, attention_maps, initial_weights = self.generator(
84
+ text_ids, attention_mask, return_attentions=True
85
+ )
86
+
87
+ # Convert tensors to PIL images
88
+ output_images = []
89
+ images_to_process = []
90
+ if resolution in ["256x256", "both"]:
91
+ images_to_process.append(generated_256)
92
+ if resolution in ["64x64", "both"]:
93
+ images_to_process.append(generated_64)
94
+
95
+ for img_batch in images_to_process:
96
+ img_batch_denorm = denormalize_image(img_batch.cpu())
97
+ for i in range(num_samples):
98
+ img_pil = self._tensor_to_pil(img_batch_denorm[i])
99
+ output_images.append(img_pil)
100
+
101
+ attention_plot = None
102
+ if show_attention:
103
+ # Create directory if it doesn't exist
104
+ output_dir = "attention_visualizations"
105
+ os.makedirs(output_dir, exist_ok=True)
106
+
107
+ # Create a more descriptive ID for the file
108
+ # so that plots for different prompts do not overwrite each other
109
+ plot_id = description.strip().replace(" ", "_")[:30]
110
+
111
+
112
+ # Use the first sample for the attention visualization
113
+ attention_plot = plot_attention_visualization(
114
+ epoch=0,
115
+ set_name="demo",
116
+ output_dir=output_dir,
117
+
118
+ generated_images=generated_256,
119
+
120
+ # Full batch data from the model
121
+ decoder_attention_maps=attention_maps,
122
+ initial_context_weights=initial_weights,
123
+
124
+ token_ids=text_ids,
125
+ attention_mask=attention_mask,
126
+ tokenizer=self.tokenizer,
127
+
128
+ # Metadata for the specific sample
129
+ description=description,
130
+ pokemon_id=plot_id,
131
+
132
+ sample_idx=0,
133
+ show_inline=False
134
+ )
135
+
136
+ return output_images, attention_plot
137
+
138
+
139
+ print("Initializing PikaPikaGen Demo...")
140
+ pokemon_gen = PokemonGenerator()
141
+
142
+ def generate_pokemon_interface(description, num_samples, show_attention, resolution):
143
+ images, attention_plot = pokemon_gen.generate_pokemon(
144
+ description=description,
145
+ num_samples=num_samples,
146
+ show_attention=show_attention,
147
+ resolution=resolution
148
+ )
149
+
150
+ if not images:
151
+ return [], attention_plot # attention_plot contains error message if error
152
+
153
+ status_msg = f"Generated {len(images)} Pokemon sprites"
154
+ if resolution == "both":
155
+ status_msg += f" ({num_samples} at 256x256 + {num_samples} at 64x64)"
156
+ else:
157
+ status_msg += f" at {resolution}"
158
+
159
+ return images, attention_plot
160
+
161
+ def create_interface():
162
+ with gr.Blocks(
163
+ title="PikaPikaGen: AI Pokemon Generator",
164
+ theme=gradio.themes.Soft(),
165
+ css="""
166
+ .main-header {
167
+ text-align: center;
168
+ background: linear-gradient(45deg, #ff6b6b, #4ecdc4);
169
+ -webkit-background-clip: text;
170
+ -webkit-text-fill-color: transparent;
171
+ font-size: 2.5em;
172
+ font-weight: bold;
173
+ margin-bottom: 0.5em;
174
+ }
175
+ .description {
176
+ text-align: center;
177
+ font-size: 1.1em;
178
+ color: #666;
179
+ margin-bottom: 1em;
180
+ }
181
+ """
182
+ ) as demo:
183
+
184
+ gr.HTML("""
185
+ <div class="main-header">🎮 PikaPikaGen: AI Pokemon Generator</div>
186
+ <div class="description">
187
+ Generates Pokemon sprites from text descriptions using Transformer attention and CNN generation.
188
+ </div>
189
+ """)
190
+
191
+ with gr.Row():
192
+ with gr.Column(scale=1):
193
+ gr.Markdown("### 📝 Input")
194
+
195
+ description_input = gr.Textbox(
196
+ label="Pokemon Description",
197
+ placeholder="Describe your Pokemon! e.g., 'A fire dragon with golden scales and ruby eyes'",
198
+ lines=3,
199
+ value="A legendary fire dragon pokemon with golden scales and red eyes"
200
+ )
201
+
202
+ with gr.Row():
203
+ num_samples = gr.Slider(
204
+ minimum=1, maximum=8, value=4, step=1,
205
+ label="Number of samples"
206
+ )
207
+
208
+ resolution = gr.Radio(
209
+ choices=["256x256", "64x64", "both"],
210
+ value="256x256",
211
+ label="Output resolution"
212
+ )
213
+
214
+ show_attention = gr.Checkbox(
215
+ label="Show attention visualization",
216
+ value=True,
217
+ info="Visualize which words the model focuses on"
218
+ )
219
+
220
+ generate_btn = gr.Button(
221
+ "🎨 Generate Pokemon!",
222
+ variant="primary",
223
+ size="lg"
224
+ )
225
+
226
+ with gr.Column(scale=2):
227
+ gr.Markdown("### 🎨 Generated Pokemon")
228
+
229
+ output_gallery = gr.Gallery(
230
+ label="Generated Pokemon sprites",
231
+ show_label=True,
232
+ elem_id="gallery",
233
+ columns=2,
234
+ rows=2,
235
+ height="auto",
236
+ allow_preview=True
237
+ )
238
+
239
+ attention_output = gr.Image(
240
+ label="Attention visualization",
241
+ show_label=True,
242
+ interactive=False
243
+ )
244
+
245
+ # Examples section
246
+ gr.Markdown("### 🌟 Examples to try")
247
+ gr.Examples(
248
+ examples=[
249
+ ["A fire dragon with golden scales and red eyes", 4, True, "256x256"],
250
+ ["An electric mouse with yellow fur and lightning bolts", 3, False, "both"],
251
+ ["A water turtle with blue shell and powerful jaws", 2, True, "256x256"],
252
+ ["A psychic cat with purple fur and mystical powers", 4, True, "256x256"],
253
+ ["A grass serpent with emerald scales and vine whips", 3, False, "64x64"],
254
+ ["An ice phoenix with crystal wings and frozen flames", 4, True, "256x256"],
255
+ ["A dark wolf with shadow abilities and glowing eyes", 2, True, "both"],
256
+ ["A steel robot pokemon with metallic armor and laser beams", 3, False, "256x256"]
257
+ ],
258
+ inputs=[description_input, num_samples, show_attention, resolution],
259
+ outputs=[output_gallery, attention_output],
260
+ fn=generate_pokemon_interface,
261
+ cache_examples=False
262
+ )
263
+
264
+ # Event handlers
265
+ generate_btn.click(
266
+ fn=generate_pokemon_interface,
267
+ inputs=[description_input, num_samples, show_attention, resolution],
268
+ outputs=[output_gallery, attention_output]
269
+ )
270
+
271
+ # Footer
272
+ gr.Markdown("""
273
+ ---
274
+ **PikaPikaGen** - Text-to-Image Pokemon Generation using Transformer + CNN
275
+ """)
276
+
277
+ return demo
278
+
279
+ if __name__ == "__main__":
280
+ print("Starting PikaPikaGen Demo...")
281
+
282
+ # Create and launch interface
283
+ demo = create_interface()
284
+
285
+ demo.launch(
286
+ server_name="0.0.0.0", # Allow external access
287
+ share=False, # Set to True for public sharing
288
+ debug=False,
289
+ show_error=True,
290
+ inbrowser=True # Auto-open browser
291
+ )
pikapikagen/losses.py ADDED
@@ -0,0 +1,103 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchvision import models
5
+ from torchvision.models import VGG19_Weights
6
+
7
+
8
+ class VGGPerceptualLoss(nn.Module):
9
+ """
10
+ Perceptual loss using VGG19 pretrained on ImageNet.
11
+ We extract features at:
12
+ - relu1_2 (index: 3)
13
+ - relu2_2 (index: 8)
14
+ - relu3_2 (index: 17)
15
+ - relu4_2 (index: 26)
16
+ Then compute L1 distance between those feature maps.
17
+ Input images are in [-1,1]. We convert to [0,1], then normalize with ImageNet stats.
18
+ """
19
+ def __init__(self, device):
20
+ super(VGGPerceptualLoss, self).__init__()
21
+ vgg19_features = models.vgg19(weights=VGG19_Weights.DEFAULT).features.to(device).eval()
22
+ # We only need layers up to 26 (relu4_2)
23
+ self.slices = nn.ModuleDict({
24
+ "relu1_2": nn.Sequential(*list(vgg19_features.children())[:4]), # conv1_1, relu1_1, conv1_2, relu1_2
25
+ "relu2_2": nn.Sequential(*list(vgg19_features.children())[4:9]), # pool1, conv2_1, relu2_1, conv2_2, relu2_2
26
+ "relu3_2": nn.Sequential(*list(vgg19_features.children())[9:18]), # pool2, conv3_1, relu3_1, conv3_2, relu3_2, ...
27
+ "relu4_2": nn.Sequential(*list(vgg19_features.children())[18:27]) # pool3, conv4_1, relu4_1, conv4_2, relu4_2
28
+ })
29
+ for param in self.parameters():
30
+ param.requires_grad = False
31
+
32
+ self.l1 = nn.L1Loss()
33
+ self.register_buffer("mean", torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1))
34
+ self.register_buffer("std", torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1))
35
+
36
+ def forward(self, img_gen, img_ref):
37
+ """
38
+ img_gen, img_ref: [B,3,H,W] in range [-1,1].
39
+ Return: sum of L1 distances between VGG feature maps at chosen layers.
40
+ """
41
+ # Convert to [0,1]
42
+ gen = (img_gen + 1.0) / 2.0
43
+ ref = (img_ref + 1.0) / 2.0
44
+ # Normalize
45
+ gen_norm = (gen - self.mean) / self.std
46
+ ref_norm = (ref - self.mean) / self.std
47
+
48
+ loss = 0.0
49
+ x_gen = gen_norm
50
+ x_ref = ref_norm
51
+ for slice_mod in self.slices.values():
52
+ x_gen = slice_mod(x_gen)
53
+ x_ref = slice_mod(x_ref)
54
+ loss += self.l1(x_gen, x_ref)
55
+ return loss
56
+
57
+
58
+ class SobelLoss(nn.Module):
59
+ """
60
+ Computes the Sobel loss between two images, which encourages edge similarity.
61
+ This loss operates on the grayscale versions of the input images.
62
+ """
63
+ def __init__(self):
64
+ super(SobelLoss, self).__init__()
65
+ # Sobel kernels for edge detection
66
+ self.kernel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32).view(1, 1, 3, 3)
67
+ self.kernel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=torch.float32).view(1, 1, 3, 3)
68
+ self.l1 = nn.L1Loss()
69
+
70
+ # Grayscale conversion weights (ITU-R BT.601)
71
+ self.rgb_to_gray_weights = torch.tensor([0.299, 0.587, 0.114]).view(1, 3, 1, 1)
72
+
73
+ def _get_edges(self, img):
74
+ """
75
+ Converts an RGB image to grayscale and applies Sobel filters.
76
+ Args:
77
+ img: [B, 3, H, W] image tensor in range [-1, 1].
78
+ Returns:
79
+ Gradient magnitude map [B, 1, H, W].
80
+ """
81
+
82
+ # Convert from [-1, 1] to [0, 1]
83
+ img = (img + 1.0) / 2.0
84
+
85
+ # Convert to grayscale
86
+ grayscale_img = F.conv2d(img, self.rgb_to_gray_weights.to(img.device))
87
+
88
+ # Apply Sobel filters
89
+ grad_x = F.conv2d(grayscale_img, self.kernel_x.to(img.device), padding=1)
90
+ grad_y = F.conv2d(grayscale_img, self.kernel_y.to(img.device), padding=1)
91
+
92
+ # Compute gradient magnitude
93
+ edges = torch.sqrt(grad_x**2 + grad_y**2 + 1e-6) # add epsilon for stability
94
+ return edges
95
+
96
+ def forward(self, img_gen, img_ref):
97
+ """
98
+ img_gen, img_ref: [B, 3, H, W] in range [-1, 1].
99
+ Returns: L1 loss between the edge maps of the two images.
100
+ """
101
+ edges_gen = self._get_edges(img_gen)
102
+ edges_ref = self._get_edges(img_ref)
103
+ return self.l1(edges_gen, edges_ref)
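A minimal usage sketch for the two losses above, on dummy batches in [-1, 1]; the 0.1 and 0.05 weights are arbitrary placeholders, not the weights used by this project's training loop:

import torch
from losses import VGGPerceptualLoss, SobelLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
perceptual = VGGPerceptualLoss(device)
sobel = SobelLoss()

# Dummy "generated" and "real" batches; only the generated one needs gradients.
fake = (torch.rand(2, 3, 64, 64, device=device) * 2 - 1).requires_grad_()
real = torch.rand(2, 3, 64, 64, device=device) * 2 - 1

loss = 0.1 * perceptual(fake, real) + 0.05 * sobel(fake, real)
loss.backward()
print(float(loss))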
pikapikagen/model.py ADDED
@@ -0,0 +1,46 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from model_blocks.text_encoder import TextEncoder
4
+ from model_blocks.image_decoder import ImageDecoder
5
+
6
+ class Generator(nn.Module):
7
+ """
8
+ Full model that combines the text encoder and the image decoder.
9
+ """
10
+ def __init__(self, text_encoder_model_name="prajjwal1/bert-mini", noise_dim=100):
11
+ super().__init__()
12
+ self.text_encoder = TextEncoder(
13
+ model_name=text_encoder_model_name,
14
+ )
15
+
16
+ text_embed_dim = 256
17
+
18
+ self.image_decoder = ImageDecoder(
19
+ noise_dim=noise_dim,
20
+ text_embed_dim=text_embed_dim
21
+ )
22
+
23
+ self.noise_dim = noise_dim
24
+
25
+ def forward(self, token_ids, attention_mask, return_attentions=False):
26
+ # token_ids.shape: (batch_size, seq_len)
27
+ # attention_mask.shape: (batch_size, seq_len)
28
+ # Generate random noise for the batch
29
+ batch_size = token_ids.size(0)
30
+ # noise.shape: (batch_size, noise_dim)
31
+ noise = torch.randn(batch_size, self.noise_dim, device=token_ids.device)
32
+
33
+ # 1. Encode the text to obtain per-token vectors
34
+ # encoder_output.shape: (batch_size, seq_len, text_embed_dim)
35
+ encoder_output = self.text_encoder(token_ids, attention_mask=attention_mask)
36
+
37
+ # 2. Generate the image using the full encoder output
38
+ # The decoder internally computes both the initial context (ATTENTION #1)
39
+ # and the per-step attention (ATTENTION #2)
40
+ # generated_image_256.shape: (batch_size, 3, 256, 256)
41
+ # generated_image_64.shape: (batch_size, 3, 64, 64)
42
+ generated_image_256, generated_image_64, attention_maps, initial_attention_weights = self.image_decoder(noise, encoder_output, attention_mask)
43
+
44
+ if return_attentions:
45
+ return generated_image_256, generated_image_64, attention_maps, initial_attention_weights
46
+ return generated_image_256, generated_image_64
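A quick end-to-end shape check for the Generator above (untrained weights, so the output is noise; the prompt is arbitrary):

import torch
from transformers import AutoTokenizer
from model import Generator

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")
tokens = tokenizer(
    "A small blue turtle pokemon with a hard shell",
    max_length=128, padding="max_length", truncation=True, return_tensors="pt",
)

generator = Generator()
with torch.no_grad():
    img_256, img_64 = generator(tokens["input_ids"], tokens["attention_mask"])
print(img_256.shape, img_64.shape)  # torch.Size([1, 3, 256, 256]) torch.Size([1, 3, 64, 64])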
pikapikagen/model_blocks/decoder_block.py ADDED
@@ -0,0 +1,59 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from model_blocks.image_cross_attention import ImageCrossAttention
4
+
5
+ class DecoderBlock(nn.Module):
6
+ """
7
+ Image decoder block
8
+ Channel adaptation (if necessary) -> Attention (optional) -> Merge -> Residual connection
9
+ -> Upsampling (ConvTranspose) -> Normalization -> Activation.
10
+ """
11
+ def __init__(self, in_channels, out_channels, use_attention=True, text_embed_dim=256, nhead=4):
12
+ super().__init__()
13
+ self.use_attention = use_attention
14
+
15
+ if self.use_attention:
16
+ # If in_channels is different from text_embed_dim, add a 1x1 conv to adapt the channel size
17
+ if in_channels != text_embed_dim:
18
+ self.channel_adapter = nn.Conv2d(in_channels, text_embed_dim, kernel_size=1, bias=False)
19
+ else:
20
+ self.channel_adapter = None
21
+
22
+ self.cross_attention = ImageCrossAttention(embed_dim=text_embed_dim, num_heads=nhead)
23
+ # Convolution to merge the (adapted) image features with the cross-attention output
24
+ self.fusion_conv = nn.Conv2d(text_embed_dim * 2, in_channels, kernel_size=1, bias=False)
25
+
26
+ # Upsample block as described in the instructions
27
+ self.upsample_block = nn.Sequential(
28
+ nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1, bias=False),
29
+ nn.GroupNorm(1, out_channels),
30
+ nn.LeakyReLU(inplace=True)
31
+ )
32
+
33
+ def forward(self, x, encoder_output=None, attention_mask=None):
34
+ attn_weights = None
35
+ if self.use_attention:
36
+ if encoder_output is None or attention_mask is None:
37
+ raise ValueError("encoder_output and attention_mask must be provided for attention.")
38
+
39
+ # Adapt the channel count if needed
40
+ if self.channel_adapter is not None:
41
+ x_adapted = self.channel_adapter(x)
42
+ else:
43
+ x_adapted = x
44
+
45
+ attn_output, attn_weights = self.cross_attention(
46
+ image_features=x_adapted,
47
+ text_features=encoder_output,
48
+ key_padding_mask=attention_mask
49
+ )
50
+
51
+ # Concatenates the features with the cross-attention output,
52
+ # then conv 1x1 and residual connection
53
+ fused_features = torch.cat([x_adapted, attn_output], dim=1) # Shape: (B, 2*in_channels, H, W)
54
+ skip = self.fusion_conv(fused_features) # Shape: (B, in_channels, H, W)
55
+ x = x + skip # Shape: (B, in_channels, H, W)
56
+
57
+
58
+ x = self.upsample_block(x)
59
+ return x, attn_weights
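Shape sketch for a single DecoderBlock with attention enabled (sizes are arbitrary; 16x16 feature maps are upsampled to 32x32):

import torch
from model_blocks.decoder_block import DecoderBlock

block = DecoderBlock(in_channels=256, out_channels=128, use_attention=True)
x = torch.randn(2, 256, 16, 16)              # image features (B, C, H, W)
text = torch.randn(2, 128, 256)              # encoder output (B, seq_len, 256)
mask = torch.ones(2, 128, dtype=torch.long)  # 1 = real token

y, attn = block(x, text, mask)
print(y.shape, attn.shape)  # torch.Size([2, 128, 32, 32]) torch.Size([2, 256, 128])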
pikapikagen/model_blocks/image_cross_attention.py ADDED
@@ -0,0 +1,49 @@
1
+ import torch.nn as nn
2
+
3
+ class ImageCrossAttention(nn.Module):
4
+ """
5
+ Image cross-attention module
6
+ Allows a sequence of queries (from the image) to "pay attention"
7
+ to a sequence of key/value (from the text), internally managing
8
+ the reshaping of tensors and the attention mask.
9
+ """
10
+ def __init__(self, embed_dim, num_heads):
11
+ super().__init__()
12
+ self.attention = nn.MultiheadAttention(
13
+ embed_dim=embed_dim, num_heads=num_heads, batch_first=True
14
+ )
15
+ self.layer_norm = nn.LayerNorm(embed_dim)
16
+
17
+ def forward(self, image_features, text_features, key_padding_mask=None):
18
+ # query: (B, C, H, W) - Image features
19
+ # key/value: (B, seq_len, embed_dim) - Text encoder output
20
+ # key_padding_mask: (B, seq_len) - Attention mask from the tokenizer
21
+
22
+ B, C, H, W = image_features.shape
23
+
24
+ # Reshape from image to sequence: (B, C, H, W) -> (B, H*W, C)
25
+ query_seq = image_features.view(B, C, H * W).permute(0, 2, 1)
26
+ query_norm = self.layer_norm(query_seq)
27
+
28
+ # Prepare the padding mask from the attention mask
29
+ # The HuggingFace mask is 1 for real tokens, 0 for padding.
30
+ # MultiheadAttention expects True for positions to ignore.
31
+ if key_padding_mask is not None:
32
+ mask = (key_padding_mask == 0)
33
+ else:
34
+ mask = None
35
+
36
+ attn_output, attn_weights = self.attention(
37
+ query=query_norm,
38
+ key=text_features,
39
+ value=text_features,
40
+ key_padding_mask=mask,
41
+ need_weights=True
42
+ )
43
+ # attn_output: (B, H*W, C)
44
+
45
+ # Convert output back into its original size
46
+ # (B, H*W, C) -> (B, C, H*W) -> (B, C, H, W)
47
+ attn_output_spatial = attn_output.permute(0, 2, 1).view(B, C, H, W)
48
+
49
+ return attn_output_spatial, attn_weights
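The reshaping logic above can be verified with a small sketch (arbitrary sizes; the returned weights are averaged over heads by nn.MultiheadAttention):

import torch
from model_blocks.image_cross_attention import ImageCrossAttention

attn = ImageCrossAttention(embed_dim=256, num_heads=4)
image_features = torch.randn(2, 256, 8, 8)              # (B, C, H, W)
text_features = torch.randn(2, 128, 256)                # (B, seq_len, embed_dim)
attention_mask = torch.ones(2, 128, dtype=torch.long)   # HuggingFace-style mask

out, weights = attn(image_features, text_features, key_padding_mask=attention_mask)
print(out.shape, weights.shape)  # torch.Size([2, 256, 8, 8]) torch.Size([2, 64, 128])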
pikapikagen/model_blocks/image_decoder.py ADDED
@@ -0,0 +1,122 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from model_blocks.decoder_block import DecoderBlock
4
+
5
+ class ImageDecoder(nn.Module):
6
+ """
7
+ CNN decoder (generator) that synthesizes the image.
8
+ This version uses per-step attention from the very first block.
9
+ """
10
+ def __init__(self, noise_dim, text_embed_dim, final_image_channels=3):
11
+ super().__init__()
12
+
13
+ # Mechanism to calculate attention scores for the initial context.
14
+ self.initial_context_scorer = nn.Sequential(
15
+ nn.Linear(in_features=text_embed_dim, out_features=512),
16
+ nn.Tanh(),
17
+ nn.Linear(in_features=512, out_features=1)
18
+ # Softmax applied in forward pass to use the attention mask
19
+ )
20
+
21
+ # Initial linear projection to a 4x4 feature map.
22
+ self.initial_projection = nn.Sequential(
23
+ nn.Linear(noise_dim + text_embed_dim, 256 * 4 * 4),
24
+ nn.GroupNorm(1, 256 * 4 * 4),
25
+ nn.LeakyReLU(inplace=True)
26
+ )
27
+
28
+ # Shared blocks for both resolutions (until 64x64)
29
+ self.blocks_64 = nn.ModuleList([
30
+ # Input: (B, 256, 4, 4) -> Output: (B, 256, 8, 8)
31
+ DecoderBlock(in_channels=256, out_channels=256, use_attention=True),
32
+ # Input: (B, 256, 8, 8) -> Output: (B, 256, 16, 16)
33
+ DecoderBlock(in_channels=256, out_channels=256, use_attention=True),
34
+ # Input: (B, 256, 16, 16) -> Output: (B, 128, 32, 32)
35
+ DecoderBlock(in_channels=256, out_channels=128, use_attention=True),
36
+ # Input: (B, 128, 32, 32) -> Output: (B, 64, 64, 64)
37
+ DecoderBlock(in_channels=128, out_channels=64, use_attention=False),
38
+ ])
39
+
40
+ # ModuleList is used instead of nn.Sequential because
41
+ # of the branching based on use_attention in the forward pass
42
+
43
+ # Blocks only for 256x256 (from 64x64 to 256x256)
44
+ self.blocks_256 = nn.ModuleList([
45
+ # Input: (B, 64, 64, 64) -> Output: (B, 32, 128, 128)
46
+ DecoderBlock(in_channels=64, out_channels=32, use_attention=True),
47
+ # Input: (B, 32, 128, 128) -> Output: (B, 16, 256, 256)
48
+ DecoderBlock(in_channels=32, out_channels=16, use_attention=False),
49
+ ])
50
+
51
+ # Last layer to get to RGB channels - 256x256
52
+ # Input: (B, 16, 256, 256) -> Output: (B, 3, 256, 256)
53
+ self.final_conv_256 = nn.Conv2d(16, final_image_channels, kernel_size=3, padding=1)
54
+ self.final_activation_256 = nn.Tanh()
55
+
56
+ # Last layer to get to RGB channels - 64x64
57
+ # Input: (B, 64, 64, 64) -> Output: (B, 3, 64, 64)
58
+ self.final_conv_64 = nn.Conv2d(64, final_image_channels, kernel_size=3, padding=1)
59
+ self.final_activation_64 = nn.Tanh()
60
+
61
+ def forward(self, noise, encoder_output_full, attention_mask):
62
+ # noise.shape: (B, noise_dim)
63
+ # encoder_output_full.shape: (B, seq_len, text_embed_dim)
64
+ # attention_mask.shape: (B, seq_len)
65
+
66
+ # 1. Compute the first attention, with the scores (logits) for each token
67
+ attn_scores = self.initial_context_scorer(encoder_output_full)
68
+
69
+ # Apply attention mask before Softmax.
70
+ # Set the scores of the padding tokens, where attention mask is 0, to -inf.
71
+ # The mask is (B, seq_len), the scores (B, seq_len, 1)
72
+ # The unsqueeze takes care of the dimension difference.
73
+ attn_scores.masked_fill_(attention_mask.unsqueeze(-1) == 0, -1e9)
74
+
75
+ # attention_weights.shape: (B, seq_len, 1)
76
+ attention_weights = torch.softmax(attn_scores, dim=1)
77
+
78
+ # Weighted average of the encoder output
79
+ # context_vector.shape: (B, text_embed_dim)
80
+ context_vector = torch.sum(attention_weights * encoder_output_full, dim=1)
81
+
82
+ # 2. Merge the noise and the context vector for the initial projection
83
+ # initial_input.shape: (B, noise_dim + text_embed_dim)
84
+ initial_input = torch.cat([noise, context_vector], dim=1)
85
+
86
+ # 3. Initial projection and reshape to fit the transposed convolutions
87
+ # x.shape: (B, 256 * 4 * 4)
88
+ x = self.initial_projection(initial_input)
89
+ # x.shape: (B, 256, 4, 4)
90
+ x = x.view(x.size(0), 256, 4, 4)
91
+
92
+ # 4. Pass through the encoder blocks
93
+ attention_maps = []
94
+
95
+ # Shared path for both resolutions (up to 64x64)
96
+ for block in self.blocks_64:
97
+ encoder_ctx = encoder_output_full if block.use_attention else None
98
+ mask_ctx = attention_mask if block.use_attention else None
99
+ x, attn_weights = block(x, encoder_ctx, mask_ctx)
100
+ if attn_weights is not None:
101
+ attention_maps.append(attn_weights)
102
+
103
+ # Now x has size (B, 64, 64, 64)
104
+
105
+ # 64x64-only path
106
+ image_64 = self.final_conv_64(x)
107
+ image_64 = self.final_activation_64(image_64)
108
+
109
+ # 5. 256x256-only path
110
+ for block in self.blocks_256:
111
+ encoder_ctx = encoder_output_full if block.use_attention else None
112
+ mask_ctx = attention_mask if block.use_attention else None
113
+ x, attn_weights = block(x, encoder_ctx, mask_ctx)
114
+ if attn_weights is not None:
115
+ attention_maps.append(attn_weights)
116
+
117
+ # Final layer for 256x256
118
+ # x_256.shape: (B, 16, 256, 256) -> (B, 3, 256, 256)
119
+ image_256 = self.final_conv_256(x)
120
+ image_256 = self.final_activation_256(image_256)
121
+
122
+ return image_256, image_64, attention_maps, attention_weights
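A standalone shape check for the ImageDecoder (random noise and a random "encoder output"; three of the shared blocks plus one 256x256 block use attention, hence four attention maps):

import torch
from model_blocks.image_decoder import ImageDecoder

decoder = ImageDecoder(noise_dim=100, text_embed_dim=256)
noise = torch.randn(2, 100)
encoder_output = torch.randn(2, 128, 256)
mask = torch.ones(2, 128, dtype=torch.long)

img_256, img_64, maps, init_weights = decoder(noise, encoder_output, mask)
print(img_256.shape, img_64.shape, len(maps), init_weights.shape)
# torch.Size([2, 3, 256, 256]) torch.Size([2, 3, 64, 64]) 4 torch.Size([2, 128, 1])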
pikapikagen/model_blocks/text_encoder.py ADDED
@@ -0,0 +1,43 @@
1
+ import torch.nn as nn
2
+ from transformers import AutoModel
3
+
4
+ class TextEncoder(nn.Module):
5
+ """
6
+ Text encoder
7
+ Uses bert-mini embeddings and passes them through a Transformer.
8
+ """
9
+ def __init__(self, model_name="prajjwal1/bert-mini", fine_tune_embeddings=True):
10
+ super().__init__()
11
+ # Load the pre-trained bert-mini model for embeddings
12
+ bert_mini_model = AutoModel.from_pretrained(model_name)
13
+
14
+ self.embedding = bert_mini_model.embeddings
15
+
16
+ # Set whether to fine-tune the embeddings during training
17
+ for param in self.embedding.parameters():
18
+ param.requires_grad = fine_tune_embeddings
19
+
20
+ encoder_layer = nn.TransformerEncoderLayer(
21
+ d_model=256, nhead=4, dim_feedforward=1024, batch_first=True
22
+ )
23
+ self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
24
+
25
+ def forward(self, token_ids, attention_mask=None):
26
+ # Get the embeddings from the tokens
27
+ # Shape: (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
28
+ embedded_text = self.embedding(token_ids)
29
+
30
+ # Prepare the padding mask for TransformerEncoder
31
+ # The HuggingFace mask is 1 for real tokens, 0 for padding.
32
+ # TransformerEncoder expects True for positions to ignore (padding).
33
+ src_key_padding_mask = None
34
+ if attention_mask is not None:
35
+ src_key_padding_mask = (attention_mask == 0)
36
+
37
+ # Pass the embeddings through the Transformer Encoder with the mask
38
+ # Shape: (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, embedding_dim)
39
+ encoder_output = self.transformer_encoder(
40
+ src=embedded_text,
41
+ src_key_padding_mask=src_key_padding_mask
42
+ )
43
+ return encoder_output
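Shape sketch for the TextEncoder (bert-mini embeddings are 256-dimensional, so the output is one 256-d vector per token):

import torch
from transformers import AutoTokenizer
from model_blocks.text_encoder import TextEncoder

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")
tokens = tokenizer(
    "A fire dragon with golden scales",
    max_length=128, padding="max_length", truncation=True, return_tensors="pt",
)

encoder = TextEncoder()
with torch.no_grad():
    features = encoder(tokens["input_ids"], attention_mask=tokens["attention_mask"])
print(features.shape)  # torch.Size([1, 128, 256])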
pikapikagen/model_checkpoint/checkpoint_epoch_150.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7902ac75581c4a54ec5345ccf2bd30440a99a4c1031b5c12af6cabb318dde225
3
+ size 789795998
pikapikagen/plots.py ADDED
@@ -0,0 +1,428 @@
1
+ import torch
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import os
5
+ import io
6
+ from PIL import Image
7
+ from utils import denormalize_image
8
+ import torch.nn.functional as F
9
+ from transformers import AutoTokenizer
10
+
11
+
12
+ def save_attention_visualization(
13
+ epoch, model, tokenizer, batch, device, set_name, output_dir, show_inline=False
14
+ ):
15
+ print(f"Epoch {epoch}: Generating attention visualization for {set_name} set...")
16
+
17
+ attention_data = generate_attention_data(model, tokenizer, batch, device)
18
+
19
+ if attention_data:
20
+ plot_attention_visualization(
21
+ epoch=epoch,
22
+ set_name=set_name,
23
+ output_dir=output_dir,
24
+ show_inline=show_inline,
25
+ **attention_data,
26
+ )
27
+ print(f"Epoch {epoch}: Attention visualization saved for Pokémon #{attention_data['pokemon_id']}.")
28
+ else:
29
+ print(f"Epoch {epoch}: Skipped attention visualization due to missing data.")
30
+
31
+
32
+ def generate_attention_data(model, tokenizer, batch, device):
33
+ """
34
+ Runs the model to generate the image and attention maps, filtering the padding tokens.
35
+ """
36
+ model.eval()
37
+
38
+ with torch.no_grad():
39
+ token_ids = batch["text"].to(device)
40
+ attention_mask = batch["attention_mask"].to(device)
41
+ # Ensure batch size is 1 for visualization
42
+ if token_ids.dim() > 1:
43
+ token_ids = token_ids[0].unsqueeze(0)
44
+ attention_mask = attention_mask[0].unsqueeze(0)
45
+
46
+ # Get the first sample from the batch
47
+ pokemon_id = batch["idx"][0]
48
+ description = batch["description"][0]
49
+
50
+ generated_image, _, attention_maps, initial_context_weights = model(  # the 64x64 output is unused here
51
+ token_ids, attention_mask, return_attentions=True
52
+ )
53
+
54
+ decoder_attention_maps = [m for m in attention_maps if m is not None]
55
+
56
+ if not decoder_attention_maps or initial_context_weights is None:
57
+ print("Attention maps not available. Skipping data generation.")
58
+ return None
59
+
60
+ # Extract valid tokens to display
61
+ tokens_all = tokenizer.convert_ids_to_tokens(token_ids.squeeze(0))
62
+ display_tokens = []
63
+ for i, token in enumerate(tokens_all):
64
+ if (
65
+ token not in [tokenizer.sep_token, tokenizer.pad_token]
66
+ and attention_mask[0, i] == 1
67
+ ):
68
+ display_tokens.append({"token": token, "index": i})
69
+
70
+ if not display_tokens:
71
+ print(f"No valid tokens to display for '{description}'. Skipping.")
72
+ return None
73
+
74
+ return {
75
+ "generated_image": generated_image.cpu(),
76
+ "decoder_attention_maps": [m.cpu() for m in decoder_attention_maps],
77
+ "initial_context_weights": initial_context_weights.cpu(),
78
+ "display_tokens": display_tokens,
79
+ "description": description,
80
+ "pokemon_id": pokemon_id,
81
+ }
82
+
83
+
84
+ def plot_attention_visualization(
85
+ # Plot identification arguments
86
+ epoch: int,
87
+ set_name: str,
88
+ output_dir: str | None,
89
+ # Data generated by the model (can be full batches)
90
+ generated_images: torch.Tensor,
91
+ decoder_attention_maps: list[torch.Tensor],
92
+ initial_context_weights: torch.Tensor,
93
+ # Original text input (can be a full batch)
94
+ token_ids: torch.Tensor,
95
+ attention_mask: torch.Tensor,
96
+ tokenizer: AutoTokenizer,
97
+ # Batch metadata (for the specific sample)
98
+ description: str,
99
+ pokemon_id: int | str,
100
+ # Control options
101
+ sample_idx: int = 0,
102
+ show_inline: bool = False,
103
+ ):
104
+ """
105
+ Generates and saves an attention visualization for a single sample from a batch.
106
+
107
+ This function is self-contained: it accepts full batch tensors and internally
108
+ handles sample selection and token preparation.
109
+
110
+ Args:
111
+ epoch (int): Epoch number (for title/filename).
112
+ set_name (str): Set name (e.g., 'train', for title/filename).
113
+ output_dir (str, optional): Folder to save the image. If None, the plot is not saved.
114
+
115
+ generated_images (torch.Tensor): Tensor of generated images.
116
+ Shape: (B, C, H, W).
117
+ decoder_attention_maps (list[torch.Tensor]): List of attention tensors.
118
+ Each tensor shape: (B, num_patches, seq_length).
119
+ initial_context_weights (torch.Tensor): Initial attention weights.
120
+ Shape: (B, 1, seq_length).
121
+
122
+ token_ids (torch.Tensor): Input token.
123
+ Shape: (B, seq_length).
124
+ attention_mask (torch.Tensor): Attention mask for tokens.
125
+ Shape: (B, seq_length).
126
+ tokenizer: The tokenizer object for id -> token conversion.
127
+
128
+ description (str): The text prompt for the selected sample.
129
+ pokemon_id (int or str): The ID of the selected sample.
130
+
131
+ sample_idx (int, optional): Index of the sample in the batch to visualize.
132
+ Defaults to 0.
133
+ show_inline (bool, optional): If True, shows the plot. Defaults to False.
134
+ """
135
+ # Select the specific sample using sample_idx and move to CPU
136
+ img_tensor = generated_images[sample_idx].cpu()
137
+ layer_maps = [m[sample_idx].cpu() for m in decoder_attention_maps if m is not None]
138
+ initial_weights = initial_context_weights[sample_idx].cpu()
139
+ token_ids_sample = token_ids[sample_idx].cpu()
140
+ attention_mask_sample = attention_mask[sample_idx].cpu()
141
+
142
+ # Token filtering logic
143
+ tokens_all = tokenizer.convert_ids_to_tokens(token_ids_sample)
144
+ display_tokens = []
145
+ for i, token in enumerate(tokens_all):
146
+ if (
147
+ token not in [tokenizer.sep_token, tokenizer.pad_token]
148
+ and attention_mask_sample[i] == 1
149
+ ):
150
+ display_tokens.append({"token": token, "index": i})
151
+
152
+ img_tensor_cpu = denormalize_image(img_tensor).permute(1, 2, 0)
153
+ num_decoder_layers = len(layer_maps)
154
+ num_tokens = len(display_tokens)
155
+ token_indices_to_display = [t["index"] for t in display_tokens]
156
+
157
+ cols = min(num_tokens, 8)
158
+ rows_per_layer = (num_tokens + cols - 1) // cols
159
+ height_ratios = [3, 2] + [2 * rows_per_layer] * num_decoder_layers
160
+ fig_height = sum(height_ratios)
161
+ fig_width = max(20, 2.5 * cols)
162
+
163
+ fig = plt.figure(figsize=(fig_width, fig_height))
164
+ gs_main = fig.add_gridspec(len(height_ratios), 1, height_ratios=height_ratios, hspace=1.2)
165
+ fig.suptitle(f"Epoch {epoch}: Attention for Pokémon #{pokemon_id} ({set_name.capitalize()})", fontsize=24)
166
+
167
+ ax_main_img = fig.add_subplot(gs_main[0])
168
+ ax_main_img.imshow(img_tensor_cpu)
169
+ ax_main_img.set_title("Generated Image", fontsize=18)
170
+ ax_main_img.text(0.5, -0.1, f"Prompt: {description}", ha="center", va="top",
171
+ transform=ax_main_img.transAxes, fontsize=14, wrap=True)
172
+ ax_main_img.axis("off")
173
+
174
+ ax_initial_attn = fig.add_subplot(gs_main[1])
175
+ initial_weights_squeezed = initial_weights.squeeze().numpy()
176
+ token_strings = [t["token"] for t in display_tokens]
177
+ relevant_weights = initial_weights_squeezed[[t["index"] for t in display_tokens]]
178
+ ax_initial_attn.bar(np.arange(len(token_strings)), relevant_weights, color="skyblue")
179
+ ax_initial_attn.set_xticks(np.arange(len(token_strings)))
180
+ ax_initial_attn.set_xticklabels(token_strings, rotation=45, ha="right", fontsize=10)
181
+ ax_initial_attn.set_title("Initial Context Attention (Global)", fontsize=16)
182
+ ax_initial_attn.set_ylabel("Weight", fontsize=12)
183
+ ax_initial_attn.grid(axis="y", linestyle="--", alpha=0.7)
184
+
185
+ # Iterate through each decoder layer's attention maps
186
+ for i, layer_attn_map in enumerate(layer_maps):
187
+ # layer_attn_map shape is now (num_patches, seq_len)
188
+ map_size_flat = layer_attn_map.shape[0]
189
+ map_side = int(np.sqrt(map_size_flat))
190
+ layer_title = f"Decoder Cross-Attention Layer {i+1} (Size: {map_side}x{map_side})"
191
+
192
+ # Extract attention weights only for tokens we want to display
193
+ relevant_attn_maps = layer_attn_map[:, token_indices_to_display]
194
+ vmin, vmax = relevant_attn_maps.min(), relevant_attn_maps.max()
195
+
196
+ # Create subplot grid for this layer
197
+ gs_layer = gs_main[2 + i].subgridspec(rows_per_layer, cols + 1, wspace=0.2, hspace=0.4, width_ratios=[*([1] * cols), 0.1])
198
+ axes_in_layer = [fig.add_subplot(gs_layer[r, c]) for r in range(rows_per_layer) for c in range(cols)]
199
+
200
+ # Add layer title above the token attention maps
201
+ if axes_in_layer:
202
+ y_pos = axes_in_layer[0].get_position().y1
203
+ fig.text(0.5, y_pos + 0.01, layer_title, ha="center", va="bottom", fontsize=16, weight="bold")
204
+
205
+ # Plot attention heatmap for each token
206
+ im = None
207
+ for j, token_info in enumerate(display_tokens):
208
+ if j >= len(axes_in_layer):
209
+ break
210
+ ax = axes_in_layer[j]
211
+ attn_for_token = layer_attn_map[:, token_info["index"]]
212
+ # Reshape flat attention to spatial grid
213
+ heatmap = attn_for_token.reshape(map_side, map_side)
214
+ im = ax.imshow(heatmap, cmap="jet", interpolation="nearest", vmin=vmin, vmax=vmax)
215
+ ax.set_title(f"'{token_info['token']}'", fontsize=12)
216
+ ax.axis("off")
217
+
218
+ # Add colorbar for the layer
219
+ if im:
220
+ cax = fig.add_subplot(gs_layer[:, -1])
221
+ cbar = fig.colorbar(im, cax=cax)
222
+ cbar.ax.tick_params(labelsize=10)
223
+ cbar.set_label("Attention Weight", rotation=270, labelpad=15, fontsize=12)
224
+
225
+ # Hide unused subplots
226
+ for j in range(num_tokens, len(axes_in_layer)):
227
+ axes_in_layer[j].axis("off")
228
+
229
+ plt.tight_layout(rect=(0, 0.03, 1, 0.96))
230
+ if output_dir is not None:
231
+ save_path = os.path.join(output_dir, f"{epoch:03d}_{set_name}_attention_visualization_{pokemon_id}.png")
232
+ plt.savefig(save_path, bbox_inches="tight")
233
+
234
+ # Save figure to bytes for potential further use (e.g., logging)
235
+ buf = io.BytesIO()
236
+ fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)
237
+ buf.seek(0)
238
+
239
+ # Convert to PIL image
240
+ attention_plot = Image.open(buf)
241
+
242
+ if show_inline:
243
+ plt.show()
244
+ plt.close(fig)
245
+
246
+ return attention_plot
247
+
248
+
249
+ def save_plot_losses(losses_g, losses_d, output_dir="training_output", show_inline=True):
250
+ """
251
+ Generates and saves a plot of the generator and discriminator losses.
252
+ """
253
+ os.makedirs(output_dir, exist_ok=True)
254
+
255
+ fig, ax = plt.subplots(figsize=(12, 6))
256
+ ax.plot(losses_g, label="Generator Loss", color="blue")
257
+ ax.plot(losses_d, label="Discriminator Loss", color="red")
258
+ ax.set_title("Training Losses")
259
+ ax.set_xlabel("Epochs")
260
+ ax.set_ylabel("Loss")
261
+ ax.legend()
262
+ ax.grid(True)
263
+
264
+ save_path = os.path.join(output_dir, "training_losses.png")
265
+ plt.savefig(save_path)
266
+ print(f"Loss plot saved to: {save_path}")
267
+
268
+ if show_inline:
269
+ plt.show()
270
+ else:
271
+ plt.close(fig)
272
+
273
+ def save_plot_non_gan_losses(train_losses_history, val_losses_history, output_dir="training_output", show_inline=True, filter_losses=None):
274
+ """
275
+ Generates and saves plots of losses for non-GAN models with multiple loss components.
276
+
277
+ Args:
278
+ train_losses_history (list[dict]): List of dicts containing training losses per epoch.
279
+ e.g., [{'l1': 0.5, 'sobel': 0.3}, ...]
280
+ val_losses_history (list[dict]): List of dicts containing validation losses per epoch.
281
+ output_dir (str): Directory to save the plot.
282
+ show_inline (bool): Whether to display the plot inline.
283
+ filter_losses (list[str], optional): List of loss names to plot.
284
+ If None, plots all found losses.
285
+ """
286
+ os.makedirs(output_dir, exist_ok=True)
287
+
288
+ # Extract all unique loss keys from both training and validation
289
+ all_keys = set()
290
+ for losses_dict in train_losses_history + val_losses_history:
291
+ all_keys.update(losses_dict.keys())
292
+
293
+ # Filter out non-numeric keys if any
294
+ loss_keys = [key for key in all_keys if key not in ['epoch']]
295
+
296
+ # Apply filter if specified
297
+ if filter_losses is not None:
298
+ loss_keys = [key for key in loss_keys if key in filter_losses]
299
+
300
+ loss_keys = sorted(loss_keys) # Sort for consistent ordering
301
+
302
+ # Create subplots
303
+ n_losses = len(loss_keys)
304
+ cols = min(3, n_losses) # Max 3 columns
305
+ rows = (n_losses + cols - 1) // cols # Ceiling division
306
+
307
+ fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows))
308
+ if n_losses == 1:
309
+ axes = [axes]
310
+ elif rows > 1:
311
+ axes = axes.flatten()
312
+
313
+ fig.suptitle("Training and Validation Losses", fontsize=16, y=0.98)
314
+
315
+ for i, loss_key in enumerate(loss_keys):
316
+ ax = axes[i]
317
+
318
+ # Extract train and validation losses for this key
319
+ train_values = [losses.get(loss_key, 0) for losses in train_losses_history]
320
+ val_values = [losses.get(loss_key, 0) for losses in val_losses_history]
321
+
322
+ epochs_train = range(1, len(train_values) + 1)
323
+ epochs_val = range(1, len(val_values) + 1)
324
+
325
+ # Plot training and validation curves
326
+ if train_values:
327
+ ax.plot(epochs_train, train_values, label=f"Train {loss_key}", color="blue", linewidth=1.5)
328
+ if val_values:
329
+ ax.plot(epochs_val, val_values, label=f"Val {loss_key}", color="red", linewidth=1.5, linestyle='--')
330
+
331
+ ax.set_title(f"{loss_key.capitalize()} Loss", fontsize=12)
332
+ ax.set_xlabel("Epoch")
333
+ ax.set_ylabel("Loss")
334
+ ax.legend()
335
+ ax.grid(True, alpha=0.3)
336
+ ax.set_ylim(bottom=0)
337
+
338
+ # Hide unused subplots
339
+ for i in range(n_losses, len(axes)):
340
+ axes[i].set_visible(False)
341
+
342
+ plt.tight_layout()
343
+
344
+ # Save the plot
345
+ save_path = os.path.join(output_dir, "non_gan_training_losses.png")
346
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
347
+ print(f"Non-GAN training losses plot saved to: {save_path}")
348
+
349
+ if show_inline:
350
+ plt.show()
351
+ else:
352
+ plt.close(fig)
353
+
354
+
355
+ def save_comparison_grid(epoch, model, batch, set_name, device, output_dir="training_output", show_inline=True):
356
+ """
357
+ Generates and saves/shows a horizontal comparison grid (real vs. generated).
358
+ Automatically handles 256x256 or 64x64 output based on set_name.
359
+ """
360
+ os.makedirs(output_dir, exist_ok=True)
361
+
362
+ model.eval()
363
+ token_ids = batch["text"].to(device)
364
+ attention_mask = batch["attention_mask"].to(device)
365
+ real_images = batch["image"]
366
+ pokemon_ids = batch["idx"]
367
+ descriptions = batch["description"]
368
+ num_images = real_images.size(0)
369
+
370
+ with torch.no_grad():
371
+ generated_images = model(token_ids, attention_mask)
372
+ # Handle tuple output from generator (e.g., 256px and 64px images)
373
+ if isinstance(generated_images, tuple):
374
+ # Check if we want 64x64 or 256x256 based on set_name
375
+ if "64" in set_name:
376
+ generated_images = generated_images[1] # Use 64x64 output
377
+ # Resize real images to 64x64 for comparison
378
+ real_images = F.interpolate(real_images, size=(64, 64), mode='bilinear', align_corners=False)
379
+ else:
380
+ generated_images = generated_images[0] # Use 256x256 output
381
+
382
+ fig, axs = plt.subplots(2, num_images, figsize=(4 * num_images, 8.5))
383
+ resolution = "64x64" if "64" in set_name else "256x256"
384
+ fig.suptitle(
385
+ f"Epoch {epoch} - {set_name.capitalize()} Comparison ({resolution})", fontsize=16, y=0.98
386
+ )
387
+
388
+ for i in range(num_images):
389
+ ax_real = axs[0, i]
390
+ ax_real.imshow(denormalize_image(real_images[i].cpu()).permute(1, 2, 0))
391
+ ax_real.set_title(f"#{pokemon_ids[i]}: {descriptions[i][:35]}...", fontsize=10)
392
+ ax_real.axis("off")
393
+
394
+ ax_gen = axs[1, i]
395
+ ax_gen.imshow(denormalize_image(generated_images[i].cpu()).permute(1, 2, 0))
396
+ ax_gen.axis("off")
397
+
398
+ axs[0, 0].text(
399
+ -0.1,
400
+ 0.5,
401
+ "Real",
402
+ ha="center",
403
+ va="center",
404
+ rotation="vertical",
405
+ fontsize=14,
406
+ transform=axs[0, 0].transAxes,
407
+ )
408
+ axs[1, 0].text(
409
+ -0.1,
410
+ 0.5,
411
+ "Generated",
412
+ ha="center",
413
+ va="center",
414
+ rotation="vertical",
415
+ fontsize=14,
416
+ transform=axs[1, 0].transAxes,
417
+ )
418
+
419
+ plt.tight_layout(rect=(0, 0, 1, 0.95))
420
+
421
+ # Save the figure and optionally show it
422
+ save_path = os.path.join(output_dir, f"{epoch:03d}_{set_name}_comparison.png")
423
+ plt.savefig(save_path)
424
+
425
+ if show_inline:
426
+ plt.show()
427
+ else:
428
+ plt.close(fig)
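As a small usage note, the loss-plotting helper above can be exercised with dummy curves (values are made up):

from plots import save_plot_losses

save_plot_losses(
    losses_g=[1.00, 0.85, 0.70, 0.62],   # dummy generator losses per epoch
    losses_d=[0.90, 0.80, 0.78, 0.75],   # dummy discriminator losses per epoch
    output_dir="training_output",
    show_inline=False,
)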
pikapikagen/utils.py ADDED
@@ -0,0 +1,12 @@
1
+ def denormalize_image(tensor):
2
+ """
3
+ Denormalizes an image tensor from the [-1, 1] range to [0, 1] for visualization.
4
+
5
+ Args:
6
+ tensor (torch.Tensor): The image tensor, with values in [-1, 1].
7
+
8
+ Returns:
9
+ torch.Tensor: The denormalized tensor with values in [0, 1].
10
+ """
11
+ tensor = (tensor + 1) / 2
12
+ return tensor.clamp(0, 1)
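Tiny example of the helper above: values in [-1, 1] are mapped linearly to [0, 1] and clamped:

import torch
from utils import denormalize_image

x = torch.tensor([-1.0, 0.0, 1.0]).view(3, 1, 1).expand(3, 4, 4)
print(denormalize_image(x).unique())  # tensor([0.0000, 0.5000, 1.0000])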
pyproject.toml ADDED
@@ -0,0 +1,18 @@
1
+ [project]
2
+ name = "pikapikagen"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "gradio>=5.35.0",
9
+ "ipykernel>=6.29.5",
10
+ "ipywidgets>=8.1.7",
11
+ "jupyterlab>=4.4.5",
12
+ "matplotlib>=3.10.3",
13
+ "pandas>=2.3.0",
14
+ "sentence-transformers>=5.0.0",
15
+ "torch-fidelity>=0.3.0",
16
+ "torchvision>=0.22.1",
17
+ "transformers>=4.53.0",
18
+ ]
uv.lock ADDED
The diff for this file is too large to render. See raw diff