Upload 47 files
- .gitattributes +1 -0
- LICENSE +21 -0
- README.MD +82 -0
- SAE/__init__.py +1 -0
- SAE/config.json +23 -0
- SAE/dataset_iterator.py +53 -0
- SAE/sae.py +216 -0
- SAE/sae_utils.py +48 -0
- SDLens/__init__.py +2 -0
- SDLens/cache_and_edit/__init__.py +1 -0
- SDLens/cache_and_edit/activation_cache.py +147 -0
- SDLens/cache_and_edit/cached_pipeline.py +342 -0
- SDLens/cache_and_edit/edits.py +223 -0
- SDLens/cache_and_edit/flux_pipeline.py +998 -0
- SDLens/cache_and_edit/hooks.py +108 -0
- SDLens/cache_and_edit/inversion.py +568 -0
- SDLens/cache_and_edit/metrics.py +116 -0
- SDLens/cache_and_edit/qkv_cache.py +557 -0
- SDLens/cache_and_edit/scheduler_inversion.py +98 -0
- SDLens/hooked_scheduler.py +40 -0
- SDLens/hooked_sd_pipeline.py +319 -0
- app.ipynb +0 -0
- app.py +768 -0
- checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
- checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
- checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
- checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
- checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
- checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
- checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
- checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
- checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
- checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
- checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
- checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
- checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json +1 -0
- checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt +3 -0
- checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth +3 -0
- checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt +3 -0
- colab_requirements.txt +8 -0
- example.ipynb +0 -0
- requirements.txt +12 -0
- resourses/image.png +3 -0
- retrieval.py +71 -0
- scripts/collect_latents_dataset.py +96 -0
- scripts/train_sae.py +308 -0
- utils/__init__.py +1 -0
- utils/hooks.py +145 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+resourses/image.png filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Viacheslav Surkov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.MD
ADDED
@@ -0,0 +1,82 @@
+# Unpacking SDXL Turbo: Interpreting Text-to-Image Models with Sparse Autoencoders
+
+[arXiv](https://arxiv.org/abs/2410.22366)
+[HuggingFace Demo](https://huggingface.co/spaces/surokpro2/Unboxing_SDXL_with_SAEs)
+[Open in Colab](https://colab.research.google.com/drive/1lWZ2yCRwCf4iuykvb-91QYUNkuzIwI3k?usp=sharing)
+
+
+![](resourses/image.png)
+
+This repository contains code to reproduce results from our paper on using sparse autoencoders (SAEs) to analyze and interpret the internal representations of text-to-image diffusion models, specifically SDXL Turbo.
+
+
+## Repository Structure
+
+```
+|-- SAE/                        # Core sparse autoencoder implementation
+|-- SDLens/                     # Tools for analyzing diffusion models
+|   `-- hooked_sd_pipeline.py   # Modified stable diffusion pipeline
+|-- scripts/
+|   |-- collect_latents_dataset.py  # Generate training data
+|   `-- train_sae.py                # Train SAE models
+|-- utils/
+|   `-- hooks.py                # Hook utility functions
+|-- checkpoints/                # Pretrained SAE model checkpoints
+|-- app.py                      # Demo application
+|-- app.ipynb                   # Interactive notebook demo
+|-- example.ipynb               # Usage examples
+`-- requirements.txt            # Python dependencies
+```
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+```
+
+## Demo Application
+
+You can try our Gradio demo application (`app.ipynb`) to browse and experiment with 20K+ features of our trained SAEs out of the box. The same notebook is available on [Google Colab](https://colab.research.google.com/drive/1lWZ2yCRwCf4iuykvb-91QYUNkuzIwI3k?usp=sharing).
+
+## Usage
+
+1. Collect latent data from SDXL Turbo:
+```bash
+python scripts/collect_latents_dataset.py --save_path={your_save_path}
+```
+
+2. Train sparse autoencoders:
+
+2.1. Set the paths to the stored latents and the checkpoint output directory in `SAE/config.json`
+
+2.2. Run the training script:
+
+```bash
+python scripts/train_sae.py
+```
+
+## Pretrained Models
+
+We provide pretrained SAE checkpoints for 4 key transformer blocks in SDXL Turbo's U-Net in the `checkpoints` folder. See `example.ipynb` for analysis examples and visualization of learned features. More pretrained SAEs with different parameters are available in the [HuggingFace repo](https://huggingface.co/surokpro2/sdxl-saes/tree/main).
+
+
+## Citation
+
+If you find this code useful in your research, please cite our paper:
+
+```bibtex
+@misc{surkov2024unpackingsdxlturbointerpreting,
+      title={Unpacking SDXL Turbo: Interpreting Text-to-Image Models with Sparse Autoencoders},
+      author={Viacheslav Surkov and Chris Wendler and Mikhail Terekhov and Justin Deschenaux and Robert West and Caglar Gulcehre},
+      year={2024},
+      eprint={2410.22366},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2410.22366},
+}
+```
+
+## Acknowledgements
+
+The SAE component is based on the [`openai/sparse_autoencoder`](https://github.com/openai/sparse_autoencoder) repository.
+
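
Editor's note: as a rough illustration of how the shipped checkpoints can be loaded, here is a minimal sketch based on `SAE/sae.py` below and the `checkpoints/` layout above. Treating `mean.pt`/`std.pt` as the per-feature mean and std used to standardize activations is an assumption, and devices may need adjusting (`torch.load` restores tensors on the device they were saved from).

```python
import torch
from SAE.sae import SparseAutoencoder

# Checkpoint path follows the layout shown in this upload.
ckpt = "checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final"

sae = SparseAutoencoder.load_from_disk(ckpt).eval()
mean = torch.load(f"{ckpt}/mean.pt")    # assumed: per-feature mean of the training activations
std = torch.load(f"{ckpt}/std.pt")      # assumed: per-feature std of the training activations

x = torch.randn(8, sae.d_model)         # stand-in for cached SDXL Turbo activations
latents = sae.encode((x - mean) / std)  # sparse top-k feature activations, shape (8, n_dirs)
```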
SAE/__init__.py
ADDED
@@ -0,0 +1 @@
+from .sae import SparseAutoencoder
SAE/config.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "sae_configs": [
+        {
+            "d_model": 1280,
+            "n_dirs": 5120,
+            "k": 20
+        },
+        {
+            "d_model": 1280,
+            "n_dirs": 640,
+            "k": 20
+        }
+    ],
+    "bs": 4096,
+    "log_interval": 500,
+    "save_interval": 5000,
+
+    "paths_to_latents": [
+        "PASS YOUR PATHS HERE. Example /home/username/latents/<timestamp>. It should contain tar archives with latents."
+    ],
+    "save_path_base": "<Your SAE save path>",
+    "block_name": "unet.down_blocks.2.attentions.1"
+}
SAE/dataset_iterator.py
ADDED
@@ -0,0 +1,53 @@
+import webdataset as wds
+import os
+import torch
+
+class ActivationsDataloader:
+    def __init__(self, paths_to_datasets, block_name, batch_size, output_or_diff='diff', num_in_buffer=50):
+        assert output_or_diff in ['diff', 'output'], "Provide 'output' or 'diff'"
+
+        self.dataset = wds.WebDataset(
+            [os.path.join(path_to_dataset, f"{block_name}.tar")
+             for path_to_dataset in paths_to_datasets]
+        ).decode("torch")
+        self.iter = iter(self.dataset)
+        self.buffer = None
+        self.pointer = 0
+        self.num_in_buffer = num_in_buffer
+        self.output_or_diff = output_or_diff
+        self.batch_size = batch_size
+        self.one_size = None
+
+    def renew_buffer(self, to_retrieve):
+        to_merge = []
+        if self.buffer is not None and self.buffer.shape[0] > self.pointer:
+            to_merge = [self.buffer[self.pointer:].clone()]
+            del self.buffer
+        for _ in range(to_retrieve):
+            sample = next(self.iter)
+            latents = sample['output.pth'] if self.output_or_diff == 'output' else sample['diff.pth']
+            latents = latents.permute((0, 1, 3, 4, 2))
+            latents = latents.reshape((-1, latents.shape[-1]))
+            to_merge.append(latents.to('cuda'))
+            self.one_size = latents.shape[0]
+        self.buffer = torch.cat(to_merge, dim=0)
+        shuffled_indices = torch.randperm(self.buffer.shape[0])
+        self.buffer = self.buffer[shuffled_indices]
+        self.pointer = 0
+
+    def iterate(self):
+        while True:
+            if self.buffer is None or self.buffer.shape[0] - self.pointer < self.num_in_buffer * self.one_size * 4 // 5:
+                try:
+                    to_retrieve = self.num_in_buffer if self.buffer is None else self.num_in_buffer // 5
+                    self.renew_buffer(to_retrieve)
+                except StopIteration:
+                    break
+
+            batch = self.buffer[self.pointer: self.pointer + self.batch_size]
+            self.pointer += self.batch_size
+
+            assert batch.shape[0] == self.batch_size
+            yield batch
+
+
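
Editor's note: a minimal usage sketch for the loader above. The dataset path and block name are placeholders; it assumes tar archives produced by `scripts/collect_latents_dataset.py` and a CUDA device, since the loader moves batches to `'cuda'`.

```python
from SAE.dataset_iterator import ActivationsDataloader

dataloader = ActivationsDataloader(
    paths_to_datasets=["/path/to/latents/<timestamp>"],   # placeholder path
    block_name="unet.down_blocks.2.attentions.1",
    batch_size=4096,
)

for batch in dataloader.iterate():
    # batch: (batch_size, d_model) activations, shuffled within the in-memory buffer
    print(batch.shape)
    break
```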
SAE/sae.py
ADDED
@@ -0,0 +1,216 @@
+'''
+Adapted from
+https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/model.py
+'''
+
+import torch
+import torch.nn as nn
+import os
+import json
+
+class SparseAutoencoder(nn.Module):
+    """
+    Top-K Autoencoder with sparse kernels. Implements:
+
+        latents = relu(topk(encoder(x - pre_bias) + latent_bias))
+        recons = decoder(latents) + pre_bias
+    """
+
+    def __init__(
+        self,
+        n_dirs_local: int,
+        d_model: int,
+        k: int,
+        auxk: int | None,
+        dead_steps_threshold: int,
+    ):
+        super().__init__()
+        self.n_dirs_local = n_dirs_local
+        self.d_model = d_model
+        self.k = k
+        self.auxk = auxk
+        self.dead_steps_threshold = dead_steps_threshold
+
+        self.encoder = nn.Linear(d_model, n_dirs_local, bias=False)
+        self.decoder = nn.Linear(n_dirs_local, d_model, bias=False)
+
+        self.pre_bias = nn.Parameter(torch.zeros(d_model))
+        self.latent_bias = nn.Parameter(torch.zeros(n_dirs_local))
+
+        self.stats_last_nonzero: torch.Tensor
+        self.register_buffer("stats_last_nonzero", torch.zeros(n_dirs_local, dtype=torch.long))
+
+        def auxk_mask_fn(x):
+            dead_mask = self.stats_last_nonzero > dead_steps_threshold
+            x.data *= dead_mask  # inplace to save memory
+            return x
+
+        self.auxk_mask_fn = auxk_mask_fn
+
+        ## initialization
+
+        # "tied" init
+        self.decoder.weight.data = self.encoder.weight.data.T.clone()
+
+        # store decoder in column major layout for kernel
+        self.decoder.weight.data = self.decoder.weight.data.T.contiguous().T
+
+        unit_norm_decoder_(self)
+
+    def save_to_disk(self, path: str):
+        PATH_TO_CFG = 'config.json'
+        PATH_TO_WEIGHTS = 'state_dict.pth'
+
+        cfg = {
+            "n_dirs_local": self.n_dirs_local,
+            "d_model": self.d_model,
+            "k": self.k,
+            "auxk": self.auxk,
+            "dead_steps_threshold": self.dead_steps_threshold,
+        }
+
+        os.makedirs(path, exist_ok=True)
+
+        with open(os.path.join(path, PATH_TO_CFG), 'w') as f:
+            json.dump(cfg, f)
+
+        torch.save({
+            "state_dict": self.state_dict(),
+        }, os.path.join(path, PATH_TO_WEIGHTS))
+
+    @classmethod
+    def load_from_disk(cls, path: str):
+        PATH_TO_CFG = 'config.json'
+        PATH_TO_WEIGHTS = 'state_dict.pth'
+
+        with open(os.path.join(path, PATH_TO_CFG), 'r') as f:
+            cfg = json.load(f)
+
+        ae = cls(
+            n_dirs_local=cfg["n_dirs_local"],
+            d_model=cfg["d_model"],
+            k=cfg["k"],
+            auxk=cfg["auxk"],
+            dead_steps_threshold=cfg["dead_steps_threshold"],
+        )
+
+        state_dict = torch.load(os.path.join(path, PATH_TO_WEIGHTS))["state_dict"]
+        ae.load_state_dict(state_dict)
+
+        return ae
+
+    @property
+    def n_dirs(self):
+        return self.n_dirs_local
+
+    def encode(self, x):
+        x = x - self.pre_bias
+        latents_pre_act = self.encoder(x) + self.latent_bias
+
+        vals, inds = torch.topk(
+            latents_pre_act,
+            k=self.k,
+            dim=-1
+        )
+
+        latents = torch.zeros_like(latents_pre_act)
+        latents.scatter_(-1, inds, torch.relu(vals))
+
+        return latents
+
+    def forward(self, x):
+        x = x - self.pre_bias
+        latents_pre_act = self.encoder(x) + self.latent_bias
+        vals, inds = torch.topk(
+            latents_pre_act,
+            k=self.k,
+            dim=-1
+        )
+
+        ## set num nonzero stat ##
+        tmp = torch.zeros_like(self.stats_last_nonzero)
+        tmp.scatter_add_(
+            0,
+            inds.reshape(-1),
+            (vals > 1e-3).to(tmp.dtype).reshape(-1),
+        )
+        self.stats_last_nonzero *= 1 - tmp.clamp(max=1)
+        self.stats_last_nonzero += 1
+        ## end stats ##
+
+        ## auxk
+        if self.auxk is not None:  # for auxk
+            # IMPORTANT: has to go after stats update!
+            # WARN: auxk_mask_fn can mutate latents_pre_act!
+            auxk_vals, auxk_inds = torch.topk(
+                self.auxk_mask_fn(latents_pre_act),
+                k=self.auxk,
+                dim=-1
+            )
+        else:
+            auxk_inds = None
+            auxk_vals = None
+
+        ## end auxk
+
+        vals = torch.relu(vals)
+        if auxk_vals is not None:
+            auxk_vals = torch.relu(auxk_vals)
+
+        rows, cols = latents_pre_act.size()
+        row_indices = torch.arange(rows).unsqueeze(1).expand(-1, self.k).reshape(-1)
+        vals = vals.reshape(-1)
+        inds = inds.reshape(-1)
+
+        indices = torch.stack([row_indices.to(inds.device), inds])
+
+        sparse_tensor = torch.sparse_coo_tensor(indices, vals, torch.Size([rows, cols]))
+
+        recons = torch.sparse.mm(sparse_tensor, self.decoder.weight.T) + self.pre_bias
+
+        return recons, {
+            "inds": inds,
+            "vals": vals,
+            "auxk_inds": auxk_inds,
+            "auxk_vals": auxk_vals,
+        }
+
+    def decode_sparse(self, inds, vals):
+        rows, cols = inds.shape[0], self.n_dirs
+
+        row_indices = torch.arange(rows).unsqueeze(1).expand(-1, inds.shape[1]).reshape(-1)
+        vals = vals.reshape(-1)
+        inds = inds.reshape(-1)
+
+        indices = torch.stack([row_indices.to(inds.device), inds])
+
+        sparse_tensor = torch.sparse_coo_tensor(indices, vals, torch.Size([rows, cols]))
+
+        recons = torch.sparse.mm(sparse_tensor, self.decoder.weight.T) + self.pre_bias
+        return recons
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+
+def unit_norm_decoder_(autoencoder: SparseAutoencoder) -> None:
+    """
+    Unit normalize the decoder weights of an autoencoder.
+    """
+    autoencoder.decoder.weight.data /= autoencoder.decoder.weight.data.norm(dim=0)
+
+
+def unit_norm_decoder_grad_adjustment_(autoencoder) -> None:
+    """project out gradient information parallel to the dictionary vectors - assumes that the decoder is already unit normed"""
+
+    assert autoencoder.decoder.weight.grad is not None
+
+    autoencoder.decoder.weight.grad +=\
+        torch.einsum("bn,bn->n", autoencoder.decoder.weight.data, autoencoder.decoder.weight.grad) *\
+        autoencoder.decoder.weight.data * -1
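
Editor's note: to make the forward contract above concrete, here is a small sketch of one training-style step. The hyperparameters are illustrative only; the actual training loop lives in `scripts/train_sae.py`, which is not reproduced in this diff, and the aux-k loss term is omitted.

```python
import torch
from SAE.sae import SparseAutoencoder, unit_norm_decoder_, unit_norm_decoder_grad_adjustment_

sae = SparseAutoencoder(n_dirs_local=5120, d_model=1280, k=20, auxk=256,
                        dead_steps_threshold=1000)  # illustrative threshold
opt = torch.optim.Adam(sae.parameters(), lr=1e-4)

x = torch.randn(4096, 1280)             # stand-in for a batch of cached activations
recons, info = sae(x)                   # info holds top-k and aux-k indices/values
loss = ((recons - x) ** 2).mean()       # reconstruction term only in this sketch

opt.zero_grad()
loss.backward()
unit_norm_decoder_grad_adjustment_(sae)  # project out gradient parallel to dictionary vectors
opt.step()
unit_norm_decoder_(sae)                  # re-normalize decoder columns after the update
```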
SAE/sae_utils.py
ADDED
@@ -0,0 +1,48 @@
+import torch
+from dataclasses import dataclass, field
+import os
+
+@dataclass
+class SAETrainingConfig:
+    d_model: int
+    n_dirs: int
+    k: int
+    block_name: str
+    bs: int
+    save_path_base: str
+    auxk: int = 256
+    lr: float = 1e-4
+    eps: float = 6.25e-10
+    dead_toks_threshold: int = 10_000_000
+    auxk_coef: float = 1/32
+
+    @property
+    def sae_name(self):
+        return f'{self.block_name}_k{self.k}_hidden{self.n_dirs}_auxk{self.auxk}_bs{self.bs}_lr{self.lr}'
+
+    @property
+    def save_path(self):
+        return os.path.join(self.save_path_base, self.sae_name)
+
+
+@dataclass
+class Config:
+    saes: list[SAETrainingConfig]
+    paths_to_latents: list[str]
+    log_interval: int
+    save_interval: int
+    bs: int
+    block_name: str
+    wandb_project: str = 'sdxl_sae_train'
+    wandb_name: str = 'multiple_sae'
+
+    def __init__(self, cfg_json):
+        self.saes = [SAETrainingConfig(**sae_cfg, block_name=cfg_json['block_name'], bs=cfg_json['bs'], save_path_base=cfg_json['save_path_base'])
+                     for sae_cfg in cfg_json['sae_configs']]
+
+        self.save_path_base = cfg_json['save_path_base']
+        self.paths_to_latents = cfg_json['paths_to_latents']
+        self.log_interval = cfg_json['log_interval']
+        self.save_interval = cfg_json['save_interval']
+        self.bs = cfg_json['bs']
+        self.block_name = cfg_json['block_name']
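
Editor's note: a small sketch of how `SAE/config.json` maps onto these dataclasses (it assumes you run it from the repository root with the config filled in).

```python
import json
from SAE.sae_utils import Config

with open("SAE/config.json") as f:
    cfg = Config(json.load(f))

# One SAETrainingConfig per entry in "sae_configs", all trained on the same block
for sae_cfg in cfg.saes:
    print(sae_cfg.sae_name, "->", sae_cfg.save_path)
```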
SDLens/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .hooked_sd_pipeline import HookedIFPipeline, HookedStableDiffusionXLPipeline
+from .cache_and_edit import CachedPipeline
SDLens/cache_and_edit/__init__.py
ADDED
@@ -0,0 +1 @@
+from .cached_pipeline import CachedPipeline
SDLens/cache_and_edit/activation_cache.py
ADDED
@@ -0,0 +1,147 @@
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from typing import List
+from diffusers.models.transformers.transformer_flux import FluxTransformerBlock, FluxSingleTransformerBlock
+from SDLens.cache_and_edit.hooks import fix_inf_values_hook, register_general_hook
+import torch
+
+class ModelActivationCache(ABC):
+    """
+    Cache for an inference pass of a Diffusion Transformer.
+    Used to cache residual streams and activations.
+    """
+    def __init__(self):
+
+        # Initialize caches for "double transformer" blocks using the subclass-defined NUM_TRANSFORMER_BLOCKS
+        if hasattr(self, 'NUM_TRANSFORMER_BLOCKS'):
+            self.image_residual = []
+            self.image_activation = []
+            self.text_residual = []
+            self.text_activation = []
+
+        # Initialize caches for "single transformer" blocks if defined (using NUM_SINGLE_TRANSFORMER_BLOCKS)
+        if hasattr(self, 'NUM_SINGLE_TRANSFORMER_BLOCKS'):
+            self.text_image_residual = []
+            self.text_image_activation = []
+
+    def __str__(self):
+        lines = [f"{self.__class__.__name__}:"]
+        for attr_name, value in self.__dict__.items():
+            if isinstance(value, list) and all(isinstance(v, torch.Tensor) for v in value):
+                shapes = value[0].shape
+                lines.append(f" {attr_name}: len={len(value)}, shapes={shapes}")
+            else:
+                lines.append(f" {attr_name}: {type(value)}")
+        return "\n".join(lines)
+
+    def _repr_pretty_(self, p, cycle):
+        p.text(str(self))
+
+    @abstractmethod
+    def get_cache_info(self):
+        """
+        Return details about the cache configuration.
+        Subclasses must implement this to provide info on their transformer block counts.
+        """
+        pass
+
+
+class FluxActivationCache(ModelActivationCache):
+    # Define number of blocks for double and single transformer caches
+    NUM_TRANSFORMER_BLOCKS = 19
+    NUM_SINGLE_TRANSFORMER_BLOCKS = 38
+
+    def __init__(self):
+        super().__init__()
+
+    def get_cache_info(self):
+        return {
+            "transformer_blocks": self.NUM_TRANSFORMER_BLOCKS,
+            "single_transformer_blocks": self.NUM_SINGLE_TRANSFORMER_BLOCKS,
+        }
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+
+class PixartActivationCache(ModelActivationCache):
+    # Define number of blocks for the double transformer cache only
+    NUM_TRANSFORMER_BLOCKS = 28
+
+    def __init__(self):
+        super().__init__()
+
+    def get_cache_info(self):
+        return {
+            "double_transformer_blocks": self.NUM_TRANSFORMER_BLOCKS,
+        }
+
+
+class ActivationCacheHandler:
+    """Used to manage the ModelActivationCache of a Diffusion Transformer."""
+
+    def __init__(self, cache: ModelActivationCache, positions_to_cache: List[str] = None):
+        """Constructor.
+
+        Args:
+            cache (ModelActivationCache): cache used to store tensors.
+            positions_to_cache (List[str], optional): names of the modules to cache.
+                If None, all modules listed in `cache.get_cache_info()` will be cached. Defaults to None.
+        """
+        self.cache = cache
+        self.positions_to_cache = positions_to_cache
+
+    @torch.no_grad()
+    def cache_residual_and_activation_hook(self, *args):
+        """
+        To be used as a forward hook on a Transformer Block.
+        It caches both residual_stream and activation (defined as output - residual_stream).
+        """
+
+        if len(args) == 3:
+            module, input, output = args
+        elif len(args) == 4:
+            module, input, kwinput, output = args
+
+        if isinstance(module, FluxTransformerBlock):
+            encoder_hidden_states = output[0]
+            hidden_states = output[1]
+
+            self.cache.image_activation.append(hidden_states - kwinput["hidden_states"])
+            self.cache.text_activation.append(encoder_hidden_states - kwinput["encoder_hidden_states"])
+            self.cache.image_residual.append(kwinput["hidden_states"])
+            self.cache.text_residual.append(kwinput["encoder_hidden_states"])
+
+        elif isinstance(module, FluxSingleTransformerBlock):
+            self.cache.text_image_activation.append(output - kwinput["hidden_states"])
+            self.cache.text_image_residual.append(kwinput["hidden_states"])
+        else:
+            raise NotImplementedError(f"Caching not implemented for {type(module)}")
+
+    @property
+    def forward_hooks_dict(self):
+
+        # insert cache storing in dict
+        hooks = defaultdict(list)
+
+        if self.positions_to_cache is None:
+            for block_type, num_layers in self.cache.get_cache_info().items():
+                for i in range(num_layers):
+                    module_name: str = f"transformer.{block_type}.{i}"
+                    hooks[module_name].append(fix_inf_values_hook)
+                    hooks[module_name].append(self.cache_residual_and_activation_hook)
+        else:
+            for module_name in self.positions_to_cache:
+                hooks[module_name].append(fix_inf_values_hook)
+                hooks[module_name].append(self.cache_residual_and_activation_hook)
+
+        return hooks
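
Editor's note: for intuition, the handler's `forward_hooks_dict` simply maps module paths to lists of hooks. A minimal sketch that only inspects it (no pipeline needed; it assumes the repo's requirements, including a recent `diffusers`, are installed):

```python
from SDLens.cache_and_edit.activation_cache import FluxActivationCache, ActivationCacheHandler

handler = ActivationCacheHandler(FluxActivationCache())
hooks = handler.forward_hooks_dict     # e.g. "transformer.transformer_blocks.0" -> [fix_inf_values_hook, cache hook]

print(len(hooks))                      # 19 double blocks + 38 single blocks = 57 entries
print(next(iter(hooks.items())))
```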
SDLens/cache_and_edit/cached_pipeline.py
ADDED
@@ -0,0 +1,342 @@
+from collections import defaultdict
+from functools import partial
+import gc
+from typing import Callable, Dict, List, Literal, Optional, Type, Union
+import torch
+from SDLens.cache_and_edit.activation_cache import FluxActivationCache, ModelActivationCache, PixartActivationCache, ActivationCacheHandler
+from diffusers.models.transformers.transformer_flux import FluxTransformerBlock, FluxSingleTransformerBlock
+from SDLens.cache_and_edit.hooks import locate_block, register_general_hook, fix_inf_values_hook, edit_streams_hook
+from SDLens.cache_and_edit.qkv_cache import QKVCacheFluxHandler, QKVCache, CachedFluxAttnProcessor3_0
+from SDLens.cache_and_edit.scheduler_inversion import FlowMatchEulerDiscreteSchedulerForInversion
+from SDLens.cache_and_edit.flux_pipeline import EditedFluxPipeline
+
+from diffusers.pipelines import FluxPipeline
+
+
+class CachedPipeline:
+
+    def __init__(self, pipe: EditedFluxPipeline, text_seq_length: int = 512):
+
+        assert isinstance(pipe, (EditedFluxPipeline, FluxPipeline)), "Use EditedFluxPipeline class in `cache_and_edit/flux_pipeline.py`"
+        self.pipe = pipe
+        self.text_seq_length = text_seq_length
+
+        # Cache handlers
+        self.activation_cache_handler = None
+        self.qkv_cache_handler = None
+        # keeps references to all registered hooks
+        self.registered_hooks = []
+
+    def setup_cache(self, use_activation_cache=True,
+                    use_qkv_cache=False,
+                    positions_to_cache: List[str] = None,
+                    positions_to_cache_foreground: List[str] = None,
+                    qkv_to_inject: QKVCache = None,
+                    inject_kv_mode: Literal["image", "text", "both"] = None,
+                    q_mask=None,
+                    processor_class: Optional[Type] = CachedFluxAttnProcessor3_0
+                    ) -> None:
+        """
+        Sets up activation_cache and/or qkv_cache, registering the required hooks.
+        If positions_to_cache is None, all modules will be cached.
+        If inject_kv_mode is None, the QKV cache will be stored; otherwise qkv_to_inject will be injected.
+        """
+
+        if use_activation_cache:
+            if isinstance(self.pipe, (EditedFluxPipeline, FluxPipeline)):
+                activation_cache = FluxActivationCache()
+            else:
+                raise AssertionError(f"activation cache not implemented for {type(self.pipe)}")
+
+            self.activation_cache_handler = ActivationCacheHandler(activation_cache, positions_to_cache)
+            # register hooks created by activation_cache
+            self._set_hooks(position_hook_dict=self.activation_cache_handler.forward_hooks_dict,
+                            with_kwargs=True)
+
+        if use_qkv_cache:
+            if isinstance(self.pipe, (EditedFluxPipeline, FluxPipeline)):
+                self.qkv_cache_handler = QKVCacheFluxHandler(self.pipe,
+                                                             positions_to_cache,
+                                                             positions_to_cache_foreground,
+                                                             inject_kv=inject_kv_mode,
+                                                             text_seq_length=self.text_seq_length,
+                                                             q_mask=q_mask,
+                                                             processor_class=processor_class,
+                                                             )
+            else:
+                raise AssertionError(f"QKV cache not implemented for {type(self.pipe)}")
+
+            # qkv_cache does not use hooks
+
+    @property
+    def activation_cache(self) -> ModelActivationCache:
+        return self.activation_cache_handler.cache if hasattr(self, "activation_cache_handler") and self.activation_cache_handler else None
+
+    @property
+    def qkv_cache(self) -> QKVCache:
+        return self.qkv_cache_handler.cache if hasattr(self, "qkv_cache_handler") and self.qkv_cache_handler else None
+
+    @torch.no_grad
+    def run(self,
+            prompt: Union[str, List[str]],
+            num_inference_steps: int = 1,
+            seed: int = 42,
+            width=1024,
+            height=1024,
+            cache_activations: bool = False,
+            cache_qkv: bool = False,
+            guidance_scale: float = 0.0,
+            positions_to_cache: List[str] = None,
+            empty_clip_embeddings: bool = True,
+            inverse: bool = False,
+            **kwargs):
+        """Run the pipeline, optionally caching activations or QKV.
+
+        Args:
+            prompt (str): Prompt to run the pipeline (NOTE: for Flux, the parameters passed are prompt='' and prompt_2=prompt)
+            num_inference_steps (int, optional): Number of inference steps. Defaults to 1.
+            seed (int, optional): Seed for the generators. Defaults to 42.
+            cache_activations (bool, optional): Whether to cache activations. Defaults to False.
+            cache_qkv (bool, optional): Whether to cache queries, keys, values. Defaults to False.
+            positions_to_cache (List[str], optional): List of blocks to cache.
+                If None, all transformer blocks will be cached. Defaults to None.
+
+        Returns:
+            Same output as the wrapped pipeline.
+        """
+
+        # First, clear all registered hooks
+        self.clear_all_hooks()
+
+        # Delete cache already present
+        if self.activation_cache or self.qkv_cache:
+
+            if self.activation_cache:
+                del(self.activation_cache_handler.cache)
+                del(self.activation_cache_handler)
+
+            if self.qkv_cache:
+                # Necessary to delete the old cache.
+                self.qkv_cache_handler.clear_cache()
+                del(self.qkv_cache_handler)
+
+            gc.collect()  # force Python to clean up unreachable objects
+            torch.cuda.empty_cache()  # tell PyTorch to release unused GPU memory from its cache
+
+        # Setup cache again for the current inference pass
+        self.setup_cache(cache_activations, cache_qkv, positions_to_cache, inject_kv_mode=None)
+
+        assert isinstance(seed, int)
+
+        if isinstance(prompt, str):
+            empty_prompt = [""]
+            prompt = [prompt]
+        else:
+            empty_prompt = [""] * len(prompt)
+
+        gen = [torch.Generator(device="cpu").manual_seed(seed) for _ in range(len(prompt))]
+
+        if inverse:
+            # maybe create scheduler for inversion
+            if not hasattr(self, "inversion_scheduler"):
+                self.inversion_scheduler = FlowMatchEulerDiscreteSchedulerForInversion.from_config(
+                    self.pipe.scheduler.config,
+                    inverse=True
+                )
+                self.og_scheduler = self.pipe.scheduler
+
+            self.pipe.scheduler = self.inversion_scheduler
+
+        output = self.pipe(
+            prompt=empty_prompt if empty_clip_embeddings else prompt,
+            prompt_2=prompt,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            generator=gen,
+            width=width,
+            height=height,
+            **kwargs
+        )
+
+        # Restore original scheduler
+        if inverse:
+            self.pipe.scheduler = self.og_scheduler
+
+        return output
+
+    @torch.no_grad
+    def run_inject_qkv(self,
+                       prompt: Union[str, List[str]],
+                       positions_to_inject: List[str] = None,
+                       positions_to_inject_foreground: List[str] = None,
+                       inject_kv_mode: Literal["image", "text", "both"] = "image",
+                       num_inference_steps: int = 1,
+                       guidance_scale: float = 0.0,
+                       seed: int = 42,
+                       empty_clip_embeddings: bool = True,
+                       q_mask=None,
+                       width: int = 1024,
+                       height: int = 1024,
+                       processor_class: Optional[Type] = CachedFluxAttnProcessor3_0,
+                       **kwargs):
+        """Run the pipeline while injecting a previously stored QKV cache.
+
+        Args:
+            prompt (str): Prompt to run the pipeline (NOTE: for Flux, the parameters passed are prompt='' and prompt_2=prompt)
+            num_inference_steps (int, optional): Number of inference steps. Defaults to 1.
+            seed (int, optional): Seed for the generators. Defaults to 42.
+            positions_to_inject (List[str], optional): List of blocks to inject into.
+                If None, all transformer blocks will be used. Defaults to None.
+
+        Returns:
+            Same output as the wrapped pipeline.
+        """
+
+        # First, clear all registered hooks
+        self.clear_all_hooks()
+
+        # Delete previous QKVCache
+        if hasattr(self, "qkv_cache_handler") and self.qkv_cache_handler is not None:
+            self.qkv_cache_handler.clear_cache()
+            del(self.qkv_cache_handler)
+            gc.collect()  # force Python to clean up unreachable objects
+            torch.cuda.empty_cache()  # tell PyTorch to release unused GPU memory from its cache
+
+        # Will setup existing QKV cache to be injected
+        self.setup_cache(use_activation_cache=False,
+                         use_qkv_cache=True,
+                         positions_to_cache=positions_to_inject,
+                         positions_to_cache_foreground=positions_to_inject_foreground,
+                         inject_kv_mode=inject_kv_mode,
+                         q_mask=q_mask,
+                         processor_class=processor_class,
+                         )
+
+        assert isinstance(seed, int)
+
+        if isinstance(prompt, str):
+            empty_prompt = [""]
+            prompt = [prompt]
+        else:
+            empty_prompt = [""] * len(prompt)
+
+        gen = [torch.Generator(device="cpu").manual_seed(seed) for _ in range(len(prompt))]
+
+        output = self.pipe(
+            prompt=empty_prompt if empty_clip_embeddings else prompt,
+            prompt_2=prompt,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            generator=gen,
+            width=width,
+            height=height,
+            **kwargs
+        )
+
+        return output
+
+    def clear_all_hooks(self):
+
+        # 1. Clear all registered hooks
+        for hook in self.registered_hooks:
+            hook.remove()
+        self.registered_hooks = []
+
+        # 2. Eventually clear other hooks registered in the pipeline but not present here
+        # TODO: make it general for other models
+        for i in range(len(locate_block(self.pipe, "transformer.transformer_blocks"))):
+            locate_block(self.pipe, f"transformer.transformer_blocks.{i}")._forward_hooks.clear()
+
+        for i in range(len(locate_block(self.pipe, "transformer.single_transformer_blocks"))):
+            locate_block(self.pipe, f"transformer.single_transformer_blocks.{i}")._forward_hooks.clear()
+
+    def _set_hooks(self,
+                   position_hook_dict: Dict[str, List[Callable]] = {},
+                   position_pre_hook_dict: Dict[str, List[Callable]] = {},
+                   with_kwargs=False
+                   ):
+        '''
+        Set hooks at specified positions and register them.
+        Args:
+            position_hook_dict: A dictionary mapping positions to hooks.
+                The keys are positions in the pipeline where the hooks should be registered.
+                The values are lists of hooks to be registered at the specified position.
+                Each hook should be a callable that takes three arguments: (module, input, output).
+            **kwargs: Keyword arguments to pass to the pipeline.
+        '''
+
+        # Register hooks
+        for is_pre_hook, hook_dict in [(True, position_pre_hook_dict), (False, position_hook_dict)]:
+            for position, hook in hook_dict.items():
+                assert isinstance(hook, list)
+                for h in hook:
+                    self.registered_hooks.append(register_general_hook(self.pipe, position, h, with_kwargs, is_pre_hook))
+
+    def run_with_edit(self,
+                      prompt: str,
+                      edit_fn: callable,
+                      layers_for_edit_fn: List[int],
+                      stream: Literal['text', 'image', 'both'],
+                      guidance_scale: float = 0.0,
+                      seed=42,
+                      num_inference_steps=1,
+                      empty_clip_embeddings: bool = True,
+                      width: int = 1024,
+                      height: int = 1024,
+                      **kwargs,
+                      ):
+
+        assert isinstance(seed, int)
+
+        self.clear_all_hooks()
+
+        # Setup hooks for edit_fn at the specified layers
+        # NOTE: edit_fn_hooks has to be Dict[str, List[Callable]]
+        edit_fn_hooks = {f"transformer.transformer_blocks.{layer}": [lambda *args: edit_streams_hook(*args, recompute_fn=edit_fn, stream=stream)]
+                         for layer in layers_for_edit_fn if layer < 19}
+        edit_fn_hooks.update({f"transformer.single_transformer_blocks.{layer - 19}": [lambda *args: edit_streams_hook(*args, recompute_fn=edit_fn, stream=stream)]
+                              for layer in layers_for_edit_fn if layer >= 19})
+
+        # register hooks in the pipe
+        self._set_hooks(position_hook_dict=edit_fn_hooks, with_kwargs=True)
+
+        # Create generators
+        if isinstance(prompt, str):
+            empty_prompt = [""]
+            prompt = [prompt]
+        else:
+            empty_prompt = [""] * len(prompt)
+
+        gen = [torch.Generator(device="cpu").manual_seed(seed) for _ in range(len(prompt))]
+
+        with torch.no_grad():
+            output = self.pipe(
+                prompt=empty_prompt if empty_clip_embeddings else prompt,
+                prompt_2=prompt,
+                num_inference_steps=num_inference_steps,
+                guidance_scale=guidance_scale,
+                generator=gen,
+                width=width,
+                height=height,
+                **kwargs
+            )
+
+        return output
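
Editor's note: a hedged end-to-end sketch of the caching wrapper. The FLUX checkpoint name, dtype, and device are assumptions, and whether `EditedFluxPipeline` (from `flux_pipeline.py`, whose diff is not rendered below) exposes `from_pretrained` exactly like the diffusers `FluxPipeline` it extends is also an assumption.

```python
import torch
from SDLens.cache_and_edit import CachedPipeline
from SDLens.cache_and_edit.flux_pipeline import EditedFluxPipeline

# Assumed checkpoint and loading path; adjust to your setup.
pipe = EditedFluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
).to("cuda")

cached = CachedPipeline(pipe)
out = cached.run("a photo of a corgi on a skateboard",
                 num_inference_steps=1,
                 cache_activations=True)

image = out.images[0]
cache = cached.activation_cache          # FluxActivationCache with per-block residuals/activations
print(cache.image_activation[0].shape)   # activation of the first double transformer block
```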
SDLens/cache_and_edit/edits.py
ADDED
@@ -0,0 +1,223 @@
+from functools import partial
+from typing import Callable, List
+
+import torch
+
+
+class Edit:
+
+    def __init__(self, ablator, vanilla_pre_forward_dict: Callable[[str, int], dict],
+                 vanilla_forward_dict: Callable[[str, int], dict],
+                 ablated_pre_forward_dict: Callable[[str, int], dict],
+                 ablated_forward_dict: Callable[[str, int], dict],):
+        self.ablator = ablator
+        self.vanilla_seed = 42
+        self.vanilla_pre_forward_dict = vanilla_pre_forward_dict
+        self.vanilla_forward_dict = vanilla_forward_dict
+
+        self.ablated_seed = 42
+        self.ablated_pre_forward_dict = ablated_pre_forward_dict
+        self.ablated_forward_dict = ablated_forward_dict
+
+
+def get_edit(name: str, **kwargs):
+    # NOTE: TransformerActivationCache and Ablation are assumed to be defined in the surrounding package.
+
+    if name == "edit_streams":
+        ablator = TransformerActivationCache()
+        stream: str = kwargs["stream"]
+        layers = kwargs["layers"]
+        edit_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = kwargs["edit_fn"]
+
+        interventions = {f"transformer.transformer_blocks.{layer}": lambda *args: ablator.edit_streams(*args, recompute_fn=partial(edit_fn, layer=layer), stream=stream) for layer in layers if layer < 19}
+        interventions.update({f"transformer.single_transformer_blocks.{layer - 19}": lambda *args: ablator.edit_streams(*args, recompute_fn=partial(edit_fn, layer=layer), stream=stream) for layer in layers if layer >= 19})
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {},
+                        ablated_forward_dict=lambda block_type, layer_num: interventions,
+                        )
+
+
+"""
+def get_ablation(name: str, **kwargs):
+
+    if name == "intermediate_text_stream_to_input":
+
+        ablator = TransformerActivationCache()
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablator.cache_attention_activation(*args, full_output=True)},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.transformer_blocks.0": lambda *args: ablator.replace_stream_input(*args, stream="text")},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+    elif name == "input_to_intermediate_text_stream":
+        ablator = TransformerActivationCache()
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {f"transformer.transformer_blocks.0": lambda *args: ablator.cache_attention_activation(*args, full_output=True)},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablator.replace_stream_input(*args, stream="text")},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+    elif name == "set_input_text":
+
+        tensor: torch.Tensor = kwargs["tensor"]
+
+        ablator = TransformerActivationCache()
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.0": lambda *args: ablator.replace_stream_input(*args, use_tensor=tensor, stream="text")},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.0": lambda *args: ablator.clamp_output(*args)})
+
+    elif name == "replace_text_stream_activation":
+        ablator = AttentionAblationCacheHook()
+        weight = kwargs["weight"] if "weight" in kwargs else 1.0
+
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.cache_text_stream},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.cache_and_inject_pre_forward},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablator.set_ablated_attention(*args, weight=weight)})
+
+    elif name == "replace_text_stream":
+        ablator = TransformerActivationCache()
+        weight = kwargs["weight"] if "weight" in kwargs else 1.0
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.cache_text_stream},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.cache_and_inject_pre_forward},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+
+    elif name == "input=output":
+        return Ablation(None,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablate_block(*args)})
+
+    elif name == "reweight_text_stream":
+        ablator = TransformerActivationCache()
+
+        residual_w=kwargs["residual_w"]
+        activation_w=kwargs["activation_w"]
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablator.reweight_text_stream(*args, residual_w=residual_w, activation_w=activation_w)})
+
+    elif name == "add_input_text":
+
+        tensor: torch.Tensor = kwargs["tensor"]
+
+        ablator = TransformerActivationCache()
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.0": lambda *args: ablator.add_text_stream_input(*args, use_tensor=tensor)},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.0": lambda *args: ablator.clamp_output(*args)})
+
+    elif name == "nothing":
+        ablator = TransformerActivationCache()
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+    elif name == "reweight_image_stream":
+        ablator = TransformerActivationCache()
+        residual_w=kwargs["residual_w"]
+        activation_w=kwargs["activation_w"]
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablator.reweight_image_stream(*args, residual_w=residual_w, activation_w=activation_w)})
+
+    if name == "intermediate_image_stream_to_input":
+
+        ablator = TransformerActivationCache()
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": lambda *args: ablator.cache_attention_activation(*args, full_output=True)},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.transformer_blocks.0": lambda *args: ablator.replace_stream_input(*args, stream='image')},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+
+    elif name == "replace_text_stream_one_layer":
+        ablator = AttentionAblationCacheHook()
+        weight = kwargs["weight"] if "weight" in kwargs else 1.0
+
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.cache_text_stream},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.cache_and_inject_pre_forward},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer.{block_type}.{layer_num}": ablator.restore_text_stream})
+
+    elif name == "replace_intermediate_representation":
+        ablator = TransformerActivationCache()
+        tensor: torch.Tensor = kwargs["tensor"]
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.single_transformer_blocks.0": lambda *args: ablator.replace_stream_input(*args, use_tensor=tensor, stream='text_image')},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+    elif name == "destroy_registers":
+        ablator = TransformerActivationCache()
+        layers: List[int] = kwargs['layers']
+        k: float = kwargs["k"]
+        stream: str = kwargs['stream']
+        random: bool = kwargs["random"] if "random" in kwargs else False
+        lowest_norm: bool = kwargs["lowest_norm"] if "lowest_norm" in kwargs else False
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.single_transformer_blocks.{i}": lambda *args: ablator.destroy_registers(*args, k=k, stream=stream, random_ablation=random, lowest_norm=lowest_norm) for i in layers},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+    elif name == "patch_registers":
+        ablator = TransformerActivationCache()
+        layers: List[int] = kwargs['layers']
+        k: float = kwargs["k"]
+        stream: str = kwargs['stream']
+        random: bool = kwargs["random"] if "random" in kwargs else False
+        lowest_norm: bool = kwargs["lowest_norm"] if "lowest_norm" in kwargs else False
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {f"transformer.single_transformer_blocks.{i}": lambda *args: ablator.destroy_registers(*args, k=k, stream=stream, random_ablation=random, lowest_norm=lowest_norm) for i in layers},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer.single_transformer_blocks.{i}": lambda *args: ablator.set_cached_registers(*args, k=k, stream=stream, random_ablation=random, lowest_norm=lowest_norm) for i in layers},
+                        ablated_forward_dict=lambda block_type, layer_num: {})
+
+    elif name == "add_registers":
+        ablator = TransformerActivationCache()
+        num_registers: int = kwargs["num_registers"]
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {f"transformer": lambda *args: insert_extra_registers(*args, num_registers=num_registers)},
+                        ablated_forward_dict=lambda block_type, layer_num: {f"transformer": lambda *args: discard_extra_registers(*args, num_registers=num_registers)},)
+
+
+    elif name == "edit_streams":
+        ablator = TransformerActivationCache()
+        stream: str = kwargs["stream"]
+        layers = kwargs["layers"]
+        edit_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = kwargs["edit_fn"]
+
+        interventions = {f"transformer.transformer_blocks.{layer}": lambda *args: ablator.edit_streams(*args, recompute_fn=partial(edit_fn, layer=layer), stream=stream) for layer in layers if layer < 19}
+        interventions.update({f"transformer.single_transformer_blocks.{layer - 19}": lambda *args: ablator.edit_streams(*args, recompute_fn=partial(edit_fn, layer=layer), stream=stream) for layer in layers if layer >= 19})
+
+        return Ablation(ablator,
+                        vanilla_pre_forward_dict=lambda block_type, layer_num: {},
+                        vanilla_forward_dict=lambda block_type, layer_num: {},
+                        ablated_pre_forward_dict=lambda block_type, layer_num: {},
+                        ablated_forward_dict=lambda block_type, layer_num: interventions,
+                        )
+"""
SDLens/cache_and_edit/flux_pipeline.py
ADDED
@@ -0,0 +1,998 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import inspect
|
16 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
from transformers import (
|
21 |
+
CLIPImageProcessor,
|
22 |
+
CLIPTextModel,
|
23 |
+
CLIPTokenizer,
|
24 |
+
CLIPVisionModelWithProjection,
|
25 |
+
T5EncoderModel,
|
26 |
+
T5TokenizerFast,
|
27 |
+
)
|
28 |
+
|
29 |
+
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
|
30 |
+
from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
|
31 |
+
from diffusers.models.autoencoders import AutoencoderKL
|
32 |
+
from diffusers.models.transformers import FluxTransformer2DModel
|
33 |
+
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
34 |
+
from diffusers.utils import (
|
35 |
+
USE_PEFT_BACKEND,
|
36 |
+
is_torch_xla_available,
|
37 |
+
logging,
|
38 |
+
replace_example_docstring,
|
39 |
+
scale_lora_layers,
|
40 |
+
unscale_lora_layers,
|
41 |
+
)
|
42 |
+
from diffusers.utils.torch_utils import randn_tensor
|
43 |
+
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
|
44 |
+
from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
|
45 |
+
|
46 |
+
|
47 |
+
if is_torch_xla_available():
|
48 |
+
import torch_xla.core.xla_model as xm
|
49 |
+
|
50 |
+
XLA_AVAILABLE = True
|
51 |
+
else:
|
52 |
+
XLA_AVAILABLE = False
|
53 |
+
|
54 |
+
|
55 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
56 |
+
|
57 |
+
EXAMPLE_DOC_STRING = """
|
58 |
+
Examples:
|
59 |
+
```py
|
60 |
+
>>> import torch
|
61 |
+
>>> from diffusers import FluxPipeline
|
62 |
+
|
63 |
+
>>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
|
64 |
+
>>> pipe.to("cuda")
|
65 |
+
>>> prompt = "A cat holding a sign that says hello world"
|
66 |
+
>>> # Depending on the variant being used, the pipeline call will slightly vary.
|
67 |
+
>>> # Refer to the pipeline documentation for more details.
|
68 |
+
>>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
|
69 |
+
>>> image.save("flux.png")
|
70 |
+
```
|
71 |
+
"""
|
72 |
+
|
73 |
+
|
74 |
+
def calculate_shift(
|
75 |
+
image_seq_len,
|
76 |
+
base_seq_len: int = 256,
|
77 |
+
max_seq_len: int = 4096,
|
78 |
+
base_shift: float = 0.5,
|
79 |
+
max_shift: float = 1.16,
|
80 |
+
):
|
81 |
+
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
|
82 |
+
b = base_shift - m * base_seq_len
|
83 |
+
mu = image_seq_len * m + b
|
84 |
+
return mu
|
85 |
+
|
86 |
+
|
87 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
|
88 |
+
def retrieve_timesteps(
|
89 |
+
scheduler,
|
90 |
+
num_inference_steps: Optional[int] = None,
|
91 |
+
device: Optional[Union[str, torch.device]] = None,
|
92 |
+
timesteps: Optional[List[int]] = None,
|
93 |
+
sigmas: Optional[List[float]] = None,
|
94 |
+
**kwargs,
|
95 |
+
):
|
96 |
+
r"""
|
97 |
+
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
|
98 |
+
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
|
99 |
+
|
100 |
+
Args:
|
101 |
+
scheduler (`SchedulerMixin`):
|
102 |
+
The scheduler to get timesteps from.
|
103 |
+
num_inference_steps (`int`):
|
104 |
+
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
|
105 |
+
must be `None`.
|
106 |
+
device (`str` or `torch.device`, *optional*):
|
107 |
+
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
108 |
+
timesteps (`List[int]`, *optional*):
|
109 |
+
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
|
110 |
+
`num_inference_steps` and `sigmas` must be `None`.
|
111 |
+
sigmas (`List[float]`, *optional*):
|
112 |
+
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
|
113 |
+
`num_inference_steps` and `timesteps` must be `None`.
|
114 |
+
|
115 |
+
Returns:
|
116 |
+
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
|
117 |
+
second element is the number of inference steps.
|
118 |
+
"""
|
119 |
+
if timesteps is not None and sigmas is not None:
|
120 |
+
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
|
121 |
+
if timesteps is not None:
|
122 |
+
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
123 |
+
if not accepts_timesteps:
|
124 |
+
raise ValueError(
|
125 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
126 |
+
f" timestep schedules. Please check whether you are using the correct scheduler."
|
127 |
+
)
|
128 |
+
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
|
129 |
+
timesteps = scheduler.timesteps
|
130 |
+
num_inference_steps = len(timesteps)
|
131 |
+
elif sigmas is not None:
|
132 |
+
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
|
133 |
+
if not accept_sigmas:
|
134 |
+
raise ValueError(
|
135 |
+
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
|
136 |
+
f" sigmas schedules. Please check whether you are using the correct scheduler."
|
137 |
+
)
|
138 |
+
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
|
139 |
+
timesteps = scheduler.timesteps
|
140 |
+
num_inference_steps = len(timesteps)
|
141 |
+
else:
|
142 |
+
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
|
143 |
+
timesteps = scheduler.timesteps
|
144 |
+
return timesteps, num_inference_steps
|
145 |
+
|
146 |
+
|
147 |
+
class EditedFluxPipeline(
|
148 |
+
DiffusionPipeline,
|
149 |
+
FluxLoraLoaderMixin,
|
150 |
+
FromSingleFileMixin,
|
151 |
+
TextualInversionLoaderMixin,
|
152 |
+
FluxIPAdapterMixin,
|
153 |
+
):
|
154 |
+
r"""
|
155 |
+
The Flux pipeline for text-to-image generation.
|
156 |
+
|
157 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
158 |
+
|
159 |
+
Args:
|
160 |
+
transformer ([`FluxTransformer2DModel`]):
|
161 |
+
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
162 |
+
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
163 |
+
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
164 |
+
vae ([`AutoencoderKL`]):
|
165 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
166 |
+
text_encoder ([`CLIPTextModel`]):
|
167 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
168 |
+
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
169 |
+
text_encoder_2 ([`T5EncoderModel`]):
|
170 |
+
[T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
|
171 |
+
the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
|
172 |
+
tokenizer (`CLIPTokenizer`):
|
173 |
+
Tokenizer of class
|
174 |
+
[CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
|
175 |
+
tokenizer_2 (`T5TokenizerFast`):
|
176 |
+
Second Tokenizer of class
|
177 |
+
[T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
|
178 |
+
"""
|
179 |
+
|
180 |
+
model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
|
181 |
+
_optional_components = ["image_encoder", "feature_extractor"]
|
182 |
+
_callback_tensor_inputs = ["latents", "prompt_embeds"]
|
183 |
+
|
184 |
+
def __init__(
|
185 |
+
self,
|
186 |
+
scheduler: FlowMatchEulerDiscreteScheduler,
|
187 |
+
vae: AutoencoderKL,
|
188 |
+
text_encoder: CLIPTextModel,
|
189 |
+
tokenizer: CLIPTokenizer,
|
190 |
+
text_encoder_2: T5EncoderModel,
|
191 |
+
tokenizer_2: T5TokenizerFast,
|
192 |
+
transformer: FluxTransformer2DModel,
|
193 |
+
image_encoder: CLIPVisionModelWithProjection = None,
|
194 |
+
feature_extractor: CLIPImageProcessor = None,
|
195 |
+
):
|
196 |
+
super().__init__()
|
197 |
+
|
198 |
+
self.register_modules(
|
199 |
+
vae=vae,
|
200 |
+
text_encoder=text_encoder,
|
201 |
+
text_encoder_2=text_encoder_2,
|
202 |
+
tokenizer=tokenizer,
|
203 |
+
tokenizer_2=tokenizer_2,
|
204 |
+
transformer=transformer,
|
205 |
+
scheduler=scheduler,
|
206 |
+
image_encoder=image_encoder,
|
207 |
+
feature_extractor=feature_extractor,
|
208 |
+
)
|
209 |
+
self.vae_scale_factor = (
|
210 |
+
2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
|
211 |
+
)
|
212 |
+
# Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
|
213 |
+
# by the patch size. So the vae scale factor is multiplied by the patch size to account for this
|
214 |
+
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
|
215 |
+
self.tokenizer_max_length = (
|
216 |
+
self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
|
217 |
+
)
|
218 |
+
self.default_sample_size = 128
|
219 |
+
|
220 |
+
def _get_t5_prompt_embeds(
|
221 |
+
self,
|
222 |
+
prompt: Union[str, List[str]] = None,
|
223 |
+
num_images_per_prompt: int = 1,
|
224 |
+
max_sequence_length: int = 512,
|
225 |
+
device: Optional[torch.device] = None,
|
226 |
+
dtype: Optional[torch.dtype] = None,
|
227 |
+
):
|
228 |
+
device = device or self._execution_device
|
229 |
+
dtype = dtype or self.text_encoder.dtype
|
230 |
+
|
231 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
232 |
+
batch_size = len(prompt)
|
233 |
+
|
234 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
235 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
|
236 |
+
|
237 |
+
text_inputs = self.tokenizer_2(
|
238 |
+
prompt,
|
239 |
+
padding="max_length",
|
240 |
+
max_length=max_sequence_length,
|
241 |
+
truncation=True,
|
242 |
+
return_length=False,
|
243 |
+
return_overflowing_tokens=False,
|
244 |
+
return_tensors="pt",
|
245 |
+
)
|
246 |
+
text_input_ids = text_inputs.input_ids
|
247 |
+
untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
|
248 |
+
|
249 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
250 |
+
removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
|
251 |
+
logger.warning(
|
252 |
+
"The following part of your input was truncated because `max_sequence_length` is set to "
|
253 |
+
f" {max_sequence_length} tokens: {removed_text}"
|
254 |
+
)
|
255 |
+
|
256 |
+
prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
|
257 |
+
|
258 |
+
dtype = self.text_encoder_2.dtype
|
259 |
+
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
260 |
+
|
261 |
+
_, seq_len, _ = prompt_embeds.shape
|
262 |
+
|
263 |
+
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
|
264 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
265 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
266 |
+
|
267 |
+
return prompt_embeds
|
268 |
+
|
269 |
+
def _get_clip_prompt_embeds(
|
270 |
+
self,
|
271 |
+
prompt: Union[str, List[str]],
|
272 |
+
num_images_per_prompt: int = 1,
|
273 |
+
device: Optional[torch.device] = None,
|
274 |
+
):
|
275 |
+
device = device or self._execution_device
|
276 |
+
|
277 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
278 |
+
batch_size = len(prompt)
|
279 |
+
|
280 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
281 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
282 |
+
|
283 |
+
text_inputs = self.tokenizer(
|
284 |
+
prompt,
|
285 |
+
padding="max_length",
|
286 |
+
max_length=self.tokenizer_max_length,
|
287 |
+
truncation=True,
|
288 |
+
return_overflowing_tokens=False,
|
289 |
+
return_length=False,
|
290 |
+
return_tensors="pt",
|
291 |
+
)
|
292 |
+
|
293 |
+
text_input_ids = text_inputs.input_ids
|
294 |
+
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
295 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
|
296 |
+
removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
|
297 |
+
logger.warning(
|
298 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
299 |
+
f" {self.tokenizer_max_length} tokens: {removed_text}"
|
300 |
+
)
|
301 |
+
prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
302 |
+
|
303 |
+
# Use pooled output of CLIPTextModel
|
304 |
+
prompt_embeds = prompt_embeds.pooler_output
|
305 |
+
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
306 |
+
|
307 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
308 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
|
309 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
310 |
+
|
311 |
+
return prompt_embeds
|
312 |
+
|
313 |
+
def encode_prompt(
|
314 |
+
self,
|
315 |
+
prompt: Union[str, List[str]],
|
316 |
+
prompt_2: Union[str, List[str]],
|
317 |
+
device: Optional[torch.device] = None,
|
318 |
+
num_images_per_prompt: int = 1,
|
319 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
320 |
+
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
|
321 |
+
max_sequence_length: int = 512,
|
322 |
+
lora_scale: Optional[float] = None,
|
323 |
+
):
|
324 |
+
r"""
|
325 |
+
|
326 |
+
Args:
|
327 |
+
prompt (`str` or `List[str]`, *optional*):
|
328 |
+
prompt to be encoded
|
329 |
+
prompt_2 (`str` or `List[str]`, *optional*):
|
330 |
+
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
|
331 |
+
used in all text-encoders
|
332 |
+
device: (`torch.device`):
|
333 |
+
torch device
|
334 |
+
num_images_per_prompt (`int`):
|
335 |
+
number of images that should be generated per prompt
|
336 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
337 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
338 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
339 |
+
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
340 |
+
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
341 |
+
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
342 |
+
lora_scale (`float`, *optional*):
|
343 |
+
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
|
344 |
+
"""
|
345 |
+
device = device or self._execution_device
|
346 |
+
|
347 |
+
# set lora scale so that monkey patched LoRA
|
348 |
+
# function of text encoder can correctly access it
|
349 |
+
if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
|
350 |
+
self._lora_scale = lora_scale
|
351 |
+
|
352 |
+
# dynamically adjust the LoRA scale
|
353 |
+
if self.text_encoder is not None and USE_PEFT_BACKEND:
|
354 |
+
scale_lora_layers(self.text_encoder, lora_scale)
|
355 |
+
if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
|
356 |
+
scale_lora_layers(self.text_encoder_2, lora_scale)
|
357 |
+
|
358 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
359 |
+
|
360 |
+
if prompt_embeds is None:
|
361 |
+
prompt_2 = prompt_2 or prompt
|
362 |
+
prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
|
363 |
+
|
364 |
+
# We only use the pooled prompt output from the CLIPTextModel
|
365 |
+
pooled_prompt_embeds = self._get_clip_prompt_embeds(
|
366 |
+
prompt=prompt,
|
367 |
+
device=device,
|
368 |
+
num_images_per_prompt=num_images_per_prompt,
|
369 |
+
)
|
370 |
+
prompt_embeds = self._get_t5_prompt_embeds(
|
371 |
+
prompt=prompt_2,
|
372 |
+
num_images_per_prompt=num_images_per_prompt,
|
373 |
+
max_sequence_length=max_sequence_length,
|
374 |
+
device=device,
|
375 |
+
)
|
376 |
+
|
377 |
+
if self.text_encoder is not None:
|
378 |
+
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
379 |
+
# Retrieve the original scale by scaling back the LoRA layers
|
380 |
+
unscale_lora_layers(self.text_encoder, lora_scale)
|
381 |
+
|
382 |
+
if self.text_encoder_2 is not None:
|
383 |
+
if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
|
384 |
+
# Retrieve the original scale by scaling back the LoRA layers
|
385 |
+
unscale_lora_layers(self.text_encoder_2, lora_scale)
|
386 |
+
|
387 |
+
dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
|
388 |
+
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
389 |
+
|
390 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
391 |
+
|
392 |
+
def encode_image(self, image, device, num_images_per_prompt):
|
393 |
+
dtype = next(self.image_encoder.parameters()).dtype
|
394 |
+
|
395 |
+
if not isinstance(image, torch.Tensor):
|
396 |
+
image = self.feature_extractor(image, return_tensors="pt").pixel_values
|
397 |
+
|
398 |
+
image = image.to(device=device, dtype=dtype)
|
399 |
+
image_embeds = self.image_encoder(image).image_embeds
|
400 |
+
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
|
401 |
+
return image_embeds
|
402 |
+
|
403 |
+
def prepare_ip_adapter_image_embeds(
|
404 |
+
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
|
405 |
+
):
|
406 |
+
image_embeds = []
|
407 |
+
if ip_adapter_image_embeds is None:
|
408 |
+
if not isinstance(ip_adapter_image, list):
|
409 |
+
ip_adapter_image = [ip_adapter_image]
|
410 |
+
|
411 |
+
if len(ip_adapter_image) != len(self.transformer.encoder_hid_proj.image_projection_layers):
|
412 |
+
raise ValueError(
|
413 |
+
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.transformer.encoder_hid_proj.image_projection_layers)} IP Adapters."
|
414 |
+
)
|
415 |
+
|
416 |
+
for single_ip_adapter_image, image_proj_layer in zip(
|
417 |
+
ip_adapter_image, self.transformer.encoder_hid_proj.image_projection_layers
|
418 |
+
):
|
419 |
+
single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
|
420 |
+
|
421 |
+
image_embeds.append(single_image_embeds[None, :])
|
422 |
+
else:
|
423 |
+
for single_image_embeds in ip_adapter_image_embeds:
|
424 |
+
image_embeds.append(single_image_embeds)
|
425 |
+
|
426 |
+
ip_adapter_image_embeds = []
|
427 |
+
for i, single_image_embeds in enumerate(image_embeds):
|
428 |
+
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
|
429 |
+
single_image_embeds = single_image_embeds.to(device=device)
|
430 |
+
ip_adapter_image_embeds.append(single_image_embeds)
|
431 |
+
|
432 |
+
return ip_adapter_image_embeds
|
433 |
+
|
434 |
+
def check_inputs(
|
435 |
+
self,
|
436 |
+
prompt,
|
437 |
+
prompt_2,
|
438 |
+
height,
|
439 |
+
width,
|
440 |
+
negative_prompt=None,
|
441 |
+
negative_prompt_2=None,
|
442 |
+
prompt_embeds=None,
|
443 |
+
negative_prompt_embeds=None,
|
444 |
+
pooled_prompt_embeds=None,
|
445 |
+
negative_pooled_prompt_embeds=None,
|
446 |
+
callback_on_step_end_tensor_inputs=None,
|
447 |
+
max_sequence_length=None,
|
448 |
+
):
|
449 |
+
if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
|
450 |
+
logger.warning(
|
451 |
+
f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
|
452 |
+
)
|
453 |
+
|
454 |
+
if callback_on_step_end_tensor_inputs is not None and not all(
|
455 |
+
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
456 |
+
):
|
457 |
+
raise ValueError(
|
458 |
+
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
459 |
+
)
|
460 |
+
|
461 |
+
if prompt is not None and prompt_embeds is not None:
|
462 |
+
raise ValueError(
|
463 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
464 |
+
" only forward one of the two."
|
465 |
+
)
|
466 |
+
elif prompt_2 is not None and prompt_embeds is not None:
|
467 |
+
raise ValueError(
|
468 |
+
f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
469 |
+
" only forward one of the two."
|
470 |
+
)
|
471 |
+
elif prompt is None and prompt_embeds is None:
|
472 |
+
raise ValueError(
|
473 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
474 |
+
)
|
475 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
476 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
477 |
+
elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
|
478 |
+
raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
|
479 |
+
|
480 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
481 |
+
raise ValueError(
|
482 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
483 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
484 |
+
)
|
485 |
+
elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
|
486 |
+
raise ValueError(
|
487 |
+
f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
|
488 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
489 |
+
)
|
490 |
+
|
491 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
492 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
493 |
+
raise ValueError(
|
494 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
495 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
496 |
+
f" {negative_prompt_embeds.shape}."
|
497 |
+
)
|
498 |
+
|
499 |
+
if prompt_embeds is not None and pooled_prompt_embeds is None:
|
500 |
+
raise ValueError(
|
501 |
+
"If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
|
502 |
+
)
|
503 |
+
if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
|
504 |
+
raise ValueError(
|
505 |
+
"If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
|
506 |
+
)
|
507 |
+
|
508 |
+
if max_sequence_length is not None and max_sequence_length > 512:
|
509 |
+
raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
|
510 |
+
|
511 |
+
@staticmethod
|
512 |
+
def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
|
513 |
+
latent_image_ids = torch.zeros(height, width, 3)
|
514 |
+
latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
|
515 |
+
latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
|
516 |
+
|
517 |
+
latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
|
518 |
+
|
519 |
+
latent_image_ids = latent_image_ids.reshape(
|
520 |
+
latent_image_id_height * latent_image_id_width, latent_image_id_channels
|
521 |
+
)
|
522 |
+
|
523 |
+
return latent_image_ids.to(device=device, dtype=dtype)
|
524 |
+
|
525 |
+
@staticmethod
|
526 |
+
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
|
527 |
+
latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
|
528 |
+
latents = latents.permute(0, 2, 4, 1, 3, 5)
|
529 |
+
latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
|
530 |
+
|
531 |
+
return latents
|
532 |
+
|
533 |
+
@staticmethod
|
534 |
+
def _unpack_latents(latents, height, width, vae_scale_factor):
|
535 |
+
batch_size, num_patches, channels = latents.shape
|
536 |
+
|
537 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
538 |
+
# latent height and width to be divisible by 2.
|
539 |
+
height = 2 * (int(height) // (vae_scale_factor * 2))
|
540 |
+
width = 2 * (int(width) // (vae_scale_factor * 2))
|
541 |
+
|
542 |
+
latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
|
543 |
+
latents = latents.permute(0, 3, 1, 4, 2, 5)
|
544 |
+
|
545 |
+
latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
|
546 |
+
|
547 |
+
return latents
|
548 |
+
|
549 |
+
def enable_vae_slicing(self):
|
550 |
+
r"""
|
551 |
+
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
552 |
+
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
553 |
+
"""
|
554 |
+
self.vae.enable_slicing()
|
555 |
+
|
556 |
+
def disable_vae_slicing(self):
|
557 |
+
r"""
|
558 |
+
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
|
559 |
+
computing decoding in one step.
|
560 |
+
"""
|
561 |
+
self.vae.disable_slicing()
|
562 |
+
|
563 |
+
def enable_vae_tiling(self):
|
564 |
+
r"""
|
565 |
+
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
566 |
+
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
567 |
+
processing larger images.
|
568 |
+
"""
|
569 |
+
self.vae.enable_tiling()
|
570 |
+
|
571 |
+
def disable_vae_tiling(self):
|
572 |
+
r"""
|
573 |
+
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
|
574 |
+
computing decoding in one step.
|
575 |
+
"""
|
576 |
+
self.vae.disable_tiling()
|
577 |
+
|
578 |
+
|
579 |
+
def prepare_latents(
|
580 |
+
self,
|
581 |
+
batch_size,
|
582 |
+
num_channels_latents,
|
583 |
+
height,
|
584 |
+
width,
|
585 |
+
dtype,
|
586 |
+
device,
|
587 |
+
generator,
|
588 |
+
latents=None,
|
589 |
+
):
|
590 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
591 |
+
# latent height and width to be divisible by 2.
|
592 |
+
height = 2 * (int(height) // (self.vae_scale_factor * 2))
|
593 |
+
width = 2 * (int(width) // (self.vae_scale_factor * 2))
|
594 |
+
|
595 |
+
shape = (batch_size, num_channels_latents, height, width)
|
596 |
+
|
597 |
+
if latents is not None:
|
598 |
+
latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
|
599 |
+
return latents.to(device=device, dtype=dtype), latent_image_ids
|
600 |
+
|
601 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
602 |
+
raise ValueError(
|
603 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
604 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
605 |
+
)
|
606 |
+
|
607 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
608 |
+
latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
|
609 |
+
|
610 |
+
latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
|
611 |
+
|
612 |
+
return latents, latent_image_ids
|
613 |
+
|
614 |
+
@property
|
615 |
+
def guidance_scale(self):
|
616 |
+
return self._guidance_scale
|
617 |
+
|
618 |
+
@property
|
619 |
+
def joint_attention_kwargs(self):
|
620 |
+
return self._joint_attention_kwargs
|
621 |
+
|
622 |
+
@property
|
623 |
+
def num_timesteps(self):
|
624 |
+
return self._num_timesteps
|
625 |
+
|
626 |
+
@property
|
627 |
+
def interrupt(self):
|
628 |
+
return self._interrupt
|
629 |
+
|
630 |
+
@torch.no_grad()
|
631 |
+
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
632 |
+
def __call__(
|
633 |
+
self,
|
634 |
+
prompt: Union[str, List[str]] = None,
|
635 |
+
prompt_2: Optional[Union[str, List[str]]] = None,
|
636 |
+
negative_prompt: Union[str, List[str]] = None,
|
637 |
+
negative_prompt_2: Optional[Union[str, List[str]]] = None,
|
638 |
+
true_cfg_scale: float = 1.0,
|
639 |
+
height: Optional[int] = None,
|
640 |
+
width: Optional[int] = None,
|
641 |
+
num_inference_steps: int = 28,
|
642 |
+
sigmas: Optional[List[float]] = None,
|
643 |
+
guidance_scale: float = 3.5,
|
644 |
+
num_images_per_prompt: Optional[int] = 1,
|
645 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
646 |
+
latents: Optional[torch.FloatTensor] = None,
|
647 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
648 |
+
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
|
649 |
+
ip_adapter_image: Optional[PipelineImageInput] = None,
|
650 |
+
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
|
651 |
+
negative_ip_adapter_image: Optional[PipelineImageInput] = None,
|
652 |
+
negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
|
653 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
654 |
+
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
|
655 |
+
output_type: Optional[str] = "pil",
|
656 |
+
return_dict: bool = True,
|
657 |
+
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
658 |
+
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
|
659 |
+
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
660 |
+
max_sequence_length: int = 512,
|
661 |
+
is_inverted_generation: bool = False,
|
662 |
+
inverted_latents_list: List[torch.Tensor] = None,
|
663 |
+
tau_b: Optional[float] = None,
|
664 |
+
bg_consistency_mask: Optional[torch.Tensor] = None,
|
665 |
+
):
|
666 |
+
r"""
|
667 |
+
Function invoked when calling the pipeline for generation.
|
668 |
+
|
669 |
+
Args:
|
670 |
+
prompt (`str` or `List[str]`, *optional*):
|
671 |
+
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
672 |
+
instead.
|
673 |
+
prompt_2 (`str` or `List[str]`, *optional*):
|
674 |
+
The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
|
675 |
+
will be used instead
|
676 |
+
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
677 |
+
The height in pixels of the generated image. This is set to 1024 by default for the best results.
|
678 |
+
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
679 |
+
The width in pixels of the generated image. This is set to 1024 by default for the best results.
|
680 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
681 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
682 |
+
expense of slower inference.
|
683 |
+
sigmas (`List[float]`, *optional*):
|
684 |
+
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
|
685 |
+
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
|
686 |
+
will be used.
|
687 |
+
guidance_scale (`float`, *optional*, defaults to 7.0):
|
688 |
+
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
689 |
+
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
690 |
+
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
691 |
+
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
692 |
+
usually at the expense of lower image quality.
|
693 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
694 |
+
The number of images to generate per prompt.
|
695 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
696 |
+
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
697 |
+
to make generation deterministic.
|
698 |
+
latents (`torch.FloatTensor`, *optional*):
|
699 |
+
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
700 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
701 |
+
tensor will ge generated by sampling using the supplied random `generator`.
|
702 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
703 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
704 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
705 |
+
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
|
706 |
+
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
|
707 |
+
If not provided, pooled text embeddings will be generated from `prompt` input argument.
|
708 |
+
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
709 |
+
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
|
710 |
+
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
711 |
+
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
|
712 |
+
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
713 |
+
negative_ip_adapter_image:
|
714 |
+
(`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
|
715 |
+
negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
|
716 |
+
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
|
717 |
+
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
|
718 |
+
provided, embeddings are computed from the `ip_adapter_image` input argument.
|
719 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
720 |
+
The output format of the generate image. Choose between
|
721 |
+
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
722 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
723 |
+
Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
|
724 |
+
joint_attention_kwargs (`dict`, *optional*):
|
725 |
+
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
726 |
+
`self.processor` in
|
727 |
+
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
728 |
+
callback_on_step_end (`Callable`, *optional*):
|
729 |
+
A function that calls at the end of each denoising steps during the inference. The function is called
|
730 |
+
with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
|
731 |
+
callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
|
732 |
+
`callback_on_step_end_tensor_inputs`.
|
733 |
+
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
734 |
+
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
|
735 |
+
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
|
736 |
+
`._callback_tensor_inputs` attribute of your pipeline class.
|
737 |
+
max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
|
738 |
+
tau_b (`float`, *optional*): Proportion of steps during which the background consistency is applied.
|
739 |
+
bg_consistency_mask (`torch.Tensor`, *optional*): Mask to use when applying background consistency. The mask
|
740 |
+
background consistency will be applied to the areas outside of the mask.
|
741 |
+
|
742 |
+
Examples:
|
743 |
+
|
744 |
+
Returns:
|
745 |
+
[`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
|
746 |
+
is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
|
747 |
+
images.
|
748 |
+
"""
|
749 |
+
|
750 |
+
height = height or self.default_sample_size * self.vae_scale_factor
|
751 |
+
width = width or self.default_sample_size * self.vae_scale_factor
|
752 |
+
|
753 |
+
# 1. Check inputs. Raise error if not correct
|
754 |
+
self.check_inputs(
|
755 |
+
prompt,
|
756 |
+
prompt_2,
|
757 |
+
height,
|
758 |
+
width,
|
759 |
+
negative_prompt=negative_prompt,
|
760 |
+
negative_prompt_2=negative_prompt_2,
|
761 |
+
prompt_embeds=prompt_embeds,
|
762 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
763 |
+
pooled_prompt_embeds=pooled_prompt_embeds,
|
764 |
+
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
|
765 |
+
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
766 |
+
max_sequence_length=max_sequence_length,
|
767 |
+
)
|
768 |
+
|
769 |
+
self._guidance_scale = guidance_scale
|
770 |
+
self._joint_attention_kwargs = joint_attention_kwargs
|
771 |
+
self._interrupt = False
|
772 |
+
|
773 |
+
# 2. Define call parameters
|
774 |
+
if prompt is not None and isinstance(prompt, str):
|
775 |
+
batch_size = 1
|
776 |
+
elif prompt is not None and isinstance(prompt, list):
|
777 |
+
batch_size = len(prompt)
|
778 |
+
else:
|
779 |
+
batch_size = prompt_embeds.shape[0]
|
780 |
+
|
781 |
+
device = self._execution_device
|
782 |
+
|
783 |
+
lora_scale = (
|
784 |
+
self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
|
785 |
+
)
|
786 |
+
do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None
|
787 |
+
(
|
788 |
+
prompt_embeds,
|
789 |
+
pooled_prompt_embeds,
|
790 |
+
text_ids,
|
791 |
+
) = self.encode_prompt(
|
792 |
+
prompt=prompt,
|
793 |
+
prompt_2=prompt_2,
|
794 |
+
prompt_embeds=prompt_embeds,
|
795 |
+
pooled_prompt_embeds=pooled_prompt_embeds,
|
796 |
+
device=device,
|
797 |
+
num_images_per_prompt=num_images_per_prompt,
|
798 |
+
max_sequence_length=max_sequence_length,
|
799 |
+
lora_scale=lora_scale,
|
800 |
+
)
|
801 |
+
if do_true_cfg:
|
802 |
+
(
|
803 |
+
negative_prompt_embeds,
|
804 |
+
negative_pooled_prompt_embeds,
|
805 |
+
_,
|
806 |
+
) = self.encode_prompt(
|
807 |
+
prompt=negative_prompt,
|
808 |
+
prompt_2=negative_prompt_2,
|
809 |
+
prompt_embeds=negative_prompt_embeds,
|
810 |
+
pooled_prompt_embeds=negative_pooled_prompt_embeds,
|
811 |
+
device=device,
|
812 |
+
num_images_per_prompt=num_images_per_prompt,
|
813 |
+
max_sequence_length=max_sequence_length,
|
814 |
+
lora_scale=lora_scale,
|
815 |
+
)
|
816 |
+
|
817 |
+
# 4. Prepare latent variables
|
818 |
+
num_channels_latents = self.transformer.config.in_channels // 4
|
819 |
+
latents, latent_image_ids = self.prepare_latents(
|
820 |
+
batch_size * num_images_per_prompt,
|
821 |
+
num_channels_latents,
|
822 |
+
height,
|
823 |
+
width,
|
824 |
+
prompt_embeds.dtype,
|
825 |
+
device,
|
826 |
+
generator,
|
827 |
+
latents,
|
828 |
+
)
|
829 |
+
|
830 |
+
# 5. Prepare timesteps
|
831 |
+
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
|
832 |
+
image_seq_len = latents.shape[1]
|
833 |
+
mu = calculate_shift(
|
834 |
+
image_seq_len,
|
835 |
+
self.scheduler.config.base_image_seq_len,
|
836 |
+
self.scheduler.config.max_image_seq_len,
|
837 |
+
self.scheduler.config.base_shift,
|
838 |
+
self.scheduler.config.max_shift,
|
839 |
+
)
|
840 |
+
timesteps, num_inference_steps = retrieve_timesteps(
|
841 |
+
self.scheduler,
|
842 |
+
num_inference_steps,
|
843 |
+
device,
|
844 |
+
sigmas=sigmas,
|
845 |
+
mu=mu,
|
846 |
+
)
|
847 |
+
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
|
848 |
+
self._num_timesteps = len(timesteps)
|
849 |
+
|
850 |
+
if is_inverted_generation:
|
851 |
+
timesteps = reversed(timesteps)
|
852 |
+
|
853 |
+
|
854 |
+
# handle guidance
|
855 |
+
if self.transformer.config.guidance_embeds:
|
856 |
+
guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
|
857 |
+
guidance = guidance.expand(latents.shape[0])
|
858 |
+
else:
|
859 |
+
guidance = None
|
860 |
+
|
861 |
+
if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
|
862 |
+
negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
|
863 |
+
):
|
864 |
+
negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
|
865 |
+
elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
|
866 |
+
negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
|
867 |
+
):
|
868 |
+
ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
|
869 |
+
|
870 |
+
if self.joint_attention_kwargs is None:
|
871 |
+
self._joint_attention_kwargs = {}
|
872 |
+
|
873 |
+
image_embeds = None
|
874 |
+
negative_image_embeds = None
|
875 |
+
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
|
876 |
+
image_embeds = self.prepare_ip_adapter_image_embeds(
|
877 |
+
ip_adapter_image,
|
878 |
+
ip_adapter_image_embeds,
|
879 |
+
device,
|
880 |
+
batch_size * num_images_per_prompt,
|
881 |
+
)
|
882 |
+
if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
|
883 |
+
negative_image_embeds = self.prepare_ip_adapter_image_embeds(
|
884 |
+
negative_ip_adapter_image,
|
885 |
+
negative_ip_adapter_image_embeds,
|
886 |
+
device,
|
887 |
+
batch_size * num_images_per_prompt,
|
888 |
+
)
|
889 |
+
|
890 |
+
# 6. Denoising loop
|
891 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
892 |
+
for i, t in enumerate(timesteps):
|
893 |
+
|
894 |
+
if self.interrupt:
|
895 |
+
continue
|
896 |
+
|
897 |
+
if image_embeds is not None:
|
898 |
+
self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
|
899 |
+
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
900 |
+
timestep = t.expand(latents.shape[0]).to(latents.dtype)
|
901 |
+
|
902 |
+
noise_pred = self.transformer(
|
903 |
+
hidden_states=latents,
|
904 |
+
timestep=timestep / 1000,
|
905 |
+
guidance=guidance,
|
906 |
+
pooled_projections=pooled_prompt_embeds,
|
907 |
+
encoder_hidden_states=prompt_embeds,
|
908 |
+
txt_ids=text_ids,
|
909 |
+
img_ids=latent_image_ids,
|
910 |
+
joint_attention_kwargs=self.joint_attention_kwargs,
|
911 |
+
return_dict=False,
|
912 |
+
)[0]
|
913 |
+
|
914 |
+
if do_true_cfg:
|
915 |
+
if negative_image_embeds is not None:
|
916 |
+
self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
|
917 |
+
neg_noise_pred = self.transformer(
|
918 |
+
hidden_states=latents,
|
919 |
+
timestep=timestep / 1000,
|
920 |
+
guidance=guidance,
|
921 |
+
pooled_projections=negative_pooled_prompt_embeds,
|
922 |
+
encoder_hidden_states=negative_prompt_embeds,
|
923 |
+
txt_ids=text_ids,
|
924 |
+
img_ids=latent_image_ids,
|
925 |
+
joint_attention_kwargs=self.joint_attention_kwargs,
|
926 |
+
return_dict=False,
|
927 |
+
)[0]
|
928 |
+
noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
|
929 |
+
|
930 |
+
# compute the previous noisy sample x_t -> x_t-1
|
931 |
+
latents_dtype = latents.dtype
|
932 |
+
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
933 |
+
|
934 |
+
if tau_b:
|
935 |
+
if bg_consistency_mask is None:
|
936 |
+
raise ValueError("if tau_b is set, bg_consistency_mask must be provided for background consistency to work.")
|
937 |
+
|
938 |
+
assert latents.shape[0] >= 3, "Three processes are required for background consistency injection (being background, foreground and composed process)."
|
939 |
+
assert latents.shape[1] == bg_consistency_mask.shape[0], f"Latents and segmentation mask must have the same number of timesteps. Got {latents.shape[1]} and {bg_consistency_mask.shape[0]}."
|
940 |
+
|
941 |
+
bg_consistency_mask = bg_consistency_mask.to(device=latents.device, dtype=torch.int32)
|
942 |
+
|
943 |
+
# TF-ICON background consistency: if we're in the first tau_b part of the de-noising process,
|
944 |
+
# overwrite the latents of the composed image with those of the background process (only outside the segmentation mask)
|
945 |
+
if i <= tau_b * num_inference_steps:
|
946 |
+
latents[2, :, :] = latents[0, :, :] * (1 - bg_consistency_mask) + latents[2, :, :] * bg_consistency_mask
|
947 |
+
|
948 |
+
# NOTE: this was the added part for inversion
|
949 |
+
if is_inverted_generation:
|
950 |
+
inverted_latents_list.append(latents)
|
951 |
+
else:
|
952 |
+
if inverted_latents_list is not None:
|
953 |
+
if isinstance(inverted_latents_list[0], torch.Tensor):
|
954 |
+
latents[0] = inverted_latents_list[-i][0]
|
955 |
+
else:
|
956 |
+
assert isinstance(inverted_latents_list[0], tuple)
|
957 |
+
for j, tensor_tuple in enumerate(inverted_latents_list[-i]):
|
958 |
+
latents[j] = tensor_tuple
|
959 |
+
|
960 |
+
|
961 |
+
|
962 |
+
if latents.dtype != latents_dtype:
|
963 |
+
if torch.backends.mps.is_available():
|
964 |
+
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
|
965 |
+
latents = latents.to(latents_dtype)
|
966 |
+
|
967 |
+
if callback_on_step_end is not None:
|
968 |
+
callback_kwargs = {}
|
969 |
+
for k in callback_on_step_end_tensor_inputs:
|
970 |
+
callback_kwargs[k] = locals()[k]
|
971 |
+
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
972 |
+
|
973 |
+
latents = callback_outputs.pop("latents", latents)
|
974 |
+
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
|
975 |
+
|
976 |
+
# call the callback, if provided
|
977 |
+
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
978 |
+
progress_bar.update()
|
979 |
+
|
980 |
+
if XLA_AVAILABLE:
|
981 |
+
xm.mark_step()
|
982 |
+
|
983 |
+
if output_type == "latent":
|
984 |
+
image = latents
|
985 |
+
|
986 |
+
else:
|
987 |
+
latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
|
988 |
+
latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
|
989 |
+
image = self.vae.decode(latents, return_dict=False)[0]
|
990 |
+
image = self.image_processor.postprocess(image, output_type=output_type)
|
991 |
+
|
992 |
+
# Offload all models
|
993 |
+
self.maybe_free_model_hooks()
|
994 |
+
|
995 |
+
if not return_dict:
|
996 |
+
return (image,)
|
997 |
+
|
998 |
+
return FluxPipelineOutput(images=image)
|
SDLens/cache_and_edit/hooks.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, Literal
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
from diffusers.models.transformers.transformer_flux import FluxTransformerBlock, FluxSingleTransformerBlock
|
5 |
+
|
6 |
+
|
7 |
+
def register_general_hook(pipe, position, hook, with_kwargs=False, is_pre_hook=False):
|
8 |
+
"""Registers a forward hook in a module of the pipeline specified with 'position'
|
9 |
+
|
10 |
+
Args:
|
11 |
+
pipe (_type_): _description_
|
12 |
+
position (_type_): _description_
|
13 |
+
hook (_type_): _description_
|
14 |
+
with_kwargs (bool, optional): _description_. Defaults to False.
|
15 |
+
is_pre_hook (bool, optional): _description_. Defaults to False.
|
16 |
+
|
17 |
+
Returns:
|
18 |
+
_type_: _description_
|
19 |
+
"""
|
20 |
+
|
21 |
+
block: nn.Module = locate_block(pipe, position)
|
22 |
+
|
23 |
+
if is_pre_hook:
|
24 |
+
return block.register_forward_pre_hook(hook, with_kwargs=with_kwargs)
|
25 |
+
else:
|
26 |
+
return block.register_forward_hook(hook, with_kwargs=with_kwargs)
|
27 |
+
|
28 |
+
|
29 |
+
def locate_block(pipe, position: str) -> nn.Module:
|
30 |
+
'''
|
31 |
+
Locate the block at the specified position in the pipeline.
|
32 |
+
'''
|
33 |
+
block = pipe
|
34 |
+
for step in position.split('.'):
|
35 |
+
if step.isdigit():
|
36 |
+
step = int(step)
|
37 |
+
block = block[step]
|
38 |
+
else:
|
39 |
+
block = getattr(block, step)
|
40 |
+
return block
|
41 |
+
|
42 |
+
|
43 |
+
def _safe_clip(x: torch.Tensor):
|
44 |
+
if x.dtype == torch.float16:
|
45 |
+
x[torch.isposinf(x)] = 65504
|
46 |
+
x[torch.isneginf(x)] = -65504
|
47 |
+
return x
|
48 |
+
|
49 |
+
|
50 |
+
@torch.no_grad()
|
51 |
+
def fix_inf_values_hook(*args):
|
52 |
+
|
53 |
+
# Case 1: no kwards are passed to the module
|
54 |
+
if len(args) == 3:
|
55 |
+
module, input, output = args
|
56 |
+
# Case 2: when kwargs are passed to the model as input
|
57 |
+
elif len(args) == 4:
|
58 |
+
module, input, kwinput, output = args
|
59 |
+
|
60 |
+
if isinstance(module, FluxTransformerBlock):
|
61 |
+
return _safe_clip(output[0]), _safe_clip(output[1])
|
62 |
+
|
63 |
+
elif isinstance(module, FluxSingleTransformerBlock):
|
64 |
+
return _safe_clip(output)
|
65 |
+
|
66 |
+
|
67 |
+
@torch.no_grad()
|
68 |
+
def edit_streams_hook(*args,
|
69 |
+
recompute_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
|
70 |
+
stream: Literal["text", "image", "both"]):
|
71 |
+
"""
|
72 |
+
recompute_fn will get as input the input tensor and the output tensor for such stream
|
73 |
+
and returns what should be the new modified output
|
74 |
+
"""
|
75 |
+
|
76 |
+
# Case 1: no kwards are passed to the module
|
77 |
+
if len(args) == 3:
|
78 |
+
module, input, output = args
|
79 |
+
# Case 2: when kwargs are passed to the model as input
|
80 |
+
elif len(args) == 4:
|
81 |
+
module, input, kwinput, output = args
|
82 |
+
else:
|
83 |
+
raise AssertionError(f'Unexpected number of hook arguments: {len(args)}')
|
84 |
+
|
85 |
+
if isinstance(module, FluxTransformerBlock):
|
86 |
+
|
87 |
+
if stream == 'text':
|
88 |
+
output_text = recompute_fn(kwinput["encoder_hidden_states"], output[0])
|
89 |
+
output_image = output[1]
|
90 |
+
elif stream == 'image':
|
91 |
+
output_image = recompute_fn(kwinput["hidden_states"], output[1])
|
92 |
+
output_text = output[0]
|
93 |
+
else:
|
94 |
+
raise AssertionError("Branch not supported for this layer.")
|
95 |
+
|
96 |
+
return _safe_clip(output_text), _safe_clip(output_image)
|
97 |
+
|
98 |
+
elif isinstance(module, FluxSingleTransformerBlock):
|
99 |
+
|
100 |
+
if stream == 'text':
|
101 |
+
output[:, :512] = recompute_fn(kwinput["hidden_states"][:, :512], output[:, :512])
|
102 |
+
elif stream == 'image':
|
103 |
+
output[:, 512:] = recompute_fn(kwinput["hidden_states"][:, 512:], output[:, 512:])
|
104 |
+
else:
|
105 |
+
output = recompute_fn(kwinput["hidden_states"], output)
|
106 |
+
|
107 |
+
return _safe_clip(output)
|
108 |
+
|
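A short usage sketch for the helpers above. The block path "transformer.transformer_blocks.0" and the `pipe` object are assumptions for illustration; `edit_streams_hook` is bound with functools.partial because its options are keyword-only, and the hook needs with_kwargs=True so that the block's keyword inputs are visible to it:

    from functools import partial
    import torch

    def amplify_image_stream(stream_in: torch.Tensor, stream_out: torch.Tensor) -> torch.Tensor:
        # Example recompute_fn: exaggerate the block's update to the image stream by 50%.
        return stream_in + 1.5 * (stream_out - stream_in)

    hook = partial(edit_streams_hook, recompute_fn=amplify_image_stream, stream="image")
    handle = register_general_hook(pipe, "transformer.transformer_blocks.0", hook, with_kwargs=True)
    # ... run the pipeline, then detach the hook ...
    handle.remove()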
SDLens/cache_and_edit/inversion.py
ADDED
@@ -0,0 +1,568 @@
|
1 |
+
from typing import Optional, Tuple
|
2 |
+
import torch
|
3 |
+
import torchvision.transforms.functional as TF
|
4 |
+
from PIL import Image
|
5 |
+
from cache_and_edit import CachedPipeline
|
6 |
+
import numpy as np
|
7 |
+
from IPython.display import display
|
8 |
+
|
9 |
+
from cache_and_edit.flux_pipeline import EditedFluxPipeline
|
10 |
+
|
11 |
+
def image2latent(pipe, image, latent_nudging_scalar = 1.15):
|
12 |
+
image = pipe.image_processor.preprocess(image).type(pipe.vae.dtype).to("cuda")
|
13 |
+
latents = pipe.vae.encode(image)["latent_dist"].mean
|
14 |
+
latents = (latents - pipe.vae.config.shift_factor) * pipe.vae.config.scaling_factor
|
15 |
+
latents = latents * latent_nudging_scalar
|
16 |
+
|
17 |
+
latents = pipe._pack_latents(
|
18 |
+
latents=latents,
|
19 |
+
batch_size=1,
|
20 |
+
num_channels_latents=16,
|
21 |
+
height=image.size(2) // 8,
|
22 |
+
width= image.size(3) // 8
|
23 |
+
)
|
24 |
+
|
25 |
+
return latents
|
26 |
+
|
27 |
+
|
28 |
+
def get_inverted_input_noise(pipe: CachedPipeline,
|
29 |
+
image,
|
30 |
+
prompt: str = "",
|
31 |
+
num_steps: int = 28,
|
32 |
+
latent_nudging_scalar: float = 1.15):
|
33 |
+
"""_summary_
|
34 |
+
|
35 |
+
Args:
|
36 |
+
pipe (CachedPipeline): cached pipeline wrapper used to run the inverse diffusion process.
|
37 |
+
image (PIL.Image.Image): image to invert into noise.
|
38 |
+
num_steps (int, optional): number of inversion steps. Defaults to 28.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
list | torch.Tensor: per-step inverted latents for an EditedFluxPipeline, otherwise the final inverted noise latent.
|
42 |
+
"""
|
43 |
+
|
44 |
+
width, height = image.size
|
45 |
+
inverted_latents_list = []
|
46 |
+
|
47 |
+
if isinstance(pipe.pipe, EditedFluxPipeline):
|
48 |
+
|
49 |
+
_ = pipe.run(
|
50 |
+
prompt,
|
51 |
+
num_inference_steps=num_steps,
|
52 |
+
seed=42,
|
53 |
+
guidance_scale=1,
|
54 |
+
output_type="latent",
|
55 |
+
latents=image2latent(pipe.pipe, image, latent_nudging_scalar=latent_nudging_scalar),
|
56 |
+
empty_clip_embeddings=False,
|
57 |
+
inverse=True,
|
58 |
+
width=width,
|
59 |
+
height=height,
|
60 |
+
is_inverted_generation=True,
|
61 |
+
inverted_latents_list=inverted_latents_list
|
62 |
+
).images[0]
|
63 |
+
|
64 |
+
return inverted_latents_list
|
65 |
+
|
66 |
+
|
67 |
+
else:
|
68 |
+
noise = pipe.run(
|
69 |
+
prompt,
|
70 |
+
num_inference_steps=num_steps,
|
71 |
+
seed=42,
|
72 |
+
guidance_scale=1,
|
73 |
+
output_type="latent",
|
74 |
+
latents=image2latent(pipe.pipe, image, latent_nudging_scalar=latent_nudging_scalar),
|
75 |
+
empty_clip_embeddings=False,
|
76 |
+
inverse=True,
|
77 |
+
width=width,
|
78 |
+
height=height
|
79 |
+
).images[0]
|
80 |
+
|
81 |
+
return noise
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
def resize_bounding_box(
|
87 |
+
bb_mask: torch.Tensor,
|
88 |
+
target_size: Tuple[int, int] = (64, 64),
|
89 |
+
) -> torch.Tensor:
|
90 |
+
"""
|
91 |
+
Downsamples a bounding-box mask patch-wise into a mask of the target size.
|
92 |
+
The mask is a 2D tensor of shape (H, W) where each element is either 0 or 1.
|
93 |
+
Any patch that contains at least one 1 in the original mask will be set to 1 in the output mask.
|
94 |
+
|
95 |
+
Args:
|
96 |
+
bb_mask (torch.Tensor): The bounding box mask as a boolean tensor of shape (H, W).
|
97 |
+
target_size (Tuple[int, int]): The size of the target mask as a tuple (H, W).
|
98 |
+
|
99 |
+
Returns:
|
100 |
+
torch.Tensor: The resized bounding box mask as a boolean tensor of shape (H, W).
|
101 |
+
"""
|
102 |
+
|
103 |
+
w_mask, h_mask = bb_mask.shape[-2:]
|
104 |
+
w_target, h_target = target_size
|
105 |
+
|
106 |
+
# Make sure the sizes are compatible
|
107 |
+
if w_mask % w_target != 0 or h_mask % h_target != 0:
|
108 |
+
raise ValueError(
|
109 |
+
f"Mask size {bb_mask.shape[-2:]} is not compatible with target size {target_size}"
|
110 |
+
)
|
111 |
+
|
112 |
+
# Compute the size of a patch
|
113 |
+
patch_size = (w_mask // w_target, h_mask // h_target)
|
114 |
+
|
115 |
+
# Iterate over the mask, one patch at a time, and save a 0 patch if the patch is empty or a 1 patch if the patch is not empty
|
116 |
+
out_mask = torch.zeros((w_target, h_target), dtype=bb_mask.dtype, device=bb_mask.device)
|
117 |
+
for i in range(w_target):
|
118 |
+
for j in range(h_target):
|
119 |
+
patch = bb_mask[
|
120 |
+
i * patch_size[0] : (i + 1) * patch_size[0],
|
121 |
+
j * patch_size[1] : (j + 1) * patch_size[1],
|
122 |
+
]
|
123 |
+
if torch.sum(patch) > 0:
|
124 |
+
out_mask[i, j] = 1
|
125 |
+
else:
|
126 |
+
out_mask[i, j] = 0
|
127 |
+
|
128 |
+
return out_mask
|
129 |
+
|
130 |
+
|
131 |
+
def place_image_in_bounding_box(
|
132 |
+
image_tensor_whc: torch.Tensor,
|
133 |
+
mask_tensor_wh: torch.Tensor
|
134 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
135 |
+
"""
|
136 |
+
Resizes an input image to fit within a bounding box (from a mask)
|
137 |
+
preserving aspect ratio, and places it centered on a new canvas.
|
138 |
+
|
139 |
+
Args:
|
140 |
+
image_tensor_whc: Input image tensor, shape [width, height, channels].
|
141 |
+
mask_tensor_wh: Bounding box mask, shape [width, height]. Defines canvas size
|
142 |
+
and contains a rectangle of 1s for the BB.
|
143 |
+
|
144 |
+
Returns:
|
145 |
+
A tuple:
|
146 |
+
- output_image_whc (torch.Tensor): Canvas with the resized image placed.
|
147 |
+
Shape [canvas_width, canvas_height, channels].
|
148 |
+
- new_mask_wh (torch.Tensor): Mask showing the actual placement of the image.
|
149 |
+
Shape [canvas_width, canvas_height].
|
150 |
+
"""
|
151 |
+
|
152 |
+
# Validate input image dimensions
|
153 |
+
if not (image_tensor_whc.ndim == 3 and image_tensor_whc.shape[0] > 0 and image_tensor_whc.shape[1] > 0):
|
154 |
+
raise ValueError(
|
155 |
+
"Input image_tensor_whc must be a 3D tensor [width, height, channels] "
|
156 |
+
"with width > 0 and height > 0."
|
157 |
+
)
|
158 |
+
img_orig_w, img_orig_h, num_channels = image_tensor_whc.shape
|
159 |
+
|
160 |
+
# Validate mask tensor dimensions
|
161 |
+
if not (mask_tensor_wh.ndim == 2):
|
162 |
+
raise ValueError("Input mask_tensor_wh must be a 2D tensor [width, height].")
|
163 |
+
canvas_w, canvas_h = mask_tensor_wh.shape
|
164 |
+
|
165 |
+
# Prepare default empty outputs for early exit scenarios
|
166 |
+
empty_output_image = torch.zeros(
|
167 |
+
canvas_w, canvas_h, num_channels,
|
168 |
+
dtype=image_tensor_whc.dtype, device=image_tensor_whc.device
|
169 |
+
)
|
170 |
+
empty_new_mask = torch.zeros(
|
171 |
+
canvas_w, canvas_h,
|
172 |
+
dtype=mask_tensor_wh.dtype, device=mask_tensor_wh.device
|
173 |
+
)
|
174 |
+
|
175 |
+
# 1. Find Bounding Box (BB) coordinates from the input mask_tensor_wh
|
176 |
+
# fg_coords shape: [N, 2], where N is num_nonzero. Each row: [x_coord, y_coord].
|
177 |
+
fg_coords = torch.nonzero(mask_tensor_wh, as_tuple=False)
|
178 |
+
|
179 |
+
if fg_coords.numel() == 0: # No bounding box found in mask
|
180 |
+
return empty_output_image, empty_new_mask
|
181 |
+
|
182 |
+
# Determine min/max extents of the bounding box
|
183 |
+
x_min_bb, y_min_bb = fg_coords[:, 0].min(), fg_coords[:, 1].min()
|
184 |
+
x_max_bb, y_max_bb = fg_coords[:, 0].max(), fg_coords[:, 1].max()
|
185 |
+
|
186 |
+
bb_target_w = x_max_bb - x_min_bb + 1
|
187 |
+
bb_target_h = y_max_bb - y_min_bb + 1
|
188 |
+
|
189 |
+
if bb_target_w <= 0 or bb_target_h <= 0: # Should not happen if fg_coords not empty
|
190 |
+
return empty_output_image, empty_new_mask
|
191 |
+
|
192 |
+
# 2. Prepare image for resizing: TF.resize expects [C, H, W]
|
193 |
+
# Input image_tensor_whc is [W, H, C]. Permute to [C, H_orig, W_orig].
|
194 |
+
image_tensor_chw = image_tensor_whc.permute(2, 1, 0)
|
195 |
+
|
196 |
+
# 3. Calculate new dimensions for the image to fit in BB, preserving aspect ratio
|
197 |
+
scale_factor_w = bb_target_w / img_orig_w
|
198 |
+
scale_factor_h = bb_target_h / img_orig_h
|
199 |
+
scale = min(scale_factor_w, scale_factor_h) # Fit entirely within BB
|
200 |
+
|
201 |
+
resized_img_w = int(img_orig_w * scale)
|
202 |
+
resized_img_h = int(img_orig_h * scale)
|
203 |
+
|
204 |
+
if resized_img_w == 0 or resized_img_h == 0: # Image scaled to nothing
|
205 |
+
return empty_output_image, empty_new_mask
|
206 |
+
|
207 |
+
# 4. Resize the image. TF.resize expects size as [H, W].
|
208 |
+
try:
|
209 |
+
# antialias=True for better quality (requires torchvision >= 0.8.0 approx)
|
210 |
+
resized_image_chw = TF.resize(image_tensor_chw, [resized_img_h, resized_img_w], antialias=True)
|
211 |
+
except TypeError: # Fallback for older torchvision versions
|
212 |
+
resized_image_chw = TF.resize(image_tensor_chw, [resized_img_h, resized_img_w])
|
213 |
+
|
214 |
+
# Permute resized image back to [W, H, C] format
|
215 |
+
resized_image_whc = resized_image_chw.permute(2, 1, 0)
|
216 |
+
|
217 |
+
# 5. Create the output canvas image (initialized to zeros)
|
218 |
+
output_image_whc = torch.zeros(
|
219 |
+
canvas_w, canvas_h, num_channels,
|
220 |
+
dtype=image_tensor_whc.dtype, device=image_tensor_whc.device
|
221 |
+
)
|
222 |
+
|
223 |
+
# 6. Calculate pasting coordinates to center the resized image within the original BB
|
224 |
+
offset_x = (bb_target_w - resized_img_w) // 2
|
225 |
+
offset_y = (bb_target_h - resized_img_h) // 2
|
226 |
+
|
227 |
+
paste_x_start = x_min_bb + offset_x
|
228 |
+
paste_y_start = y_min_bb + offset_y
|
229 |
+
|
230 |
+
paste_x_end = paste_x_start + resized_img_w
|
231 |
+
paste_y_end = paste_y_start + resized_img_h
|
232 |
+
|
233 |
+
# Place the resized image onto the canvas
|
234 |
+
output_image_whc[paste_x_start:paste_x_end, paste_y_start:paste_y_end, :] = resized_image_whc
|
235 |
+
|
236 |
+
# 7. Create the new mask representing where the image was actually placed
|
237 |
+
new_mask_wh = torch.zeros(
|
238 |
+
canvas_w, canvas_h,
|
239 |
+
dtype=mask_tensor_wh.dtype, device=mask_tensor_wh.device
|
240 |
+
)
|
241 |
+
new_mask_wh[paste_x_start:paste_x_end, paste_y_start:paste_y_end] = 1
|
242 |
+
|
243 |
+
return output_image_whc, new_mask_wh
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
### Compose inverted noises by placing the (optionally segmented) foreground into the target bounding box
|
248 |
+
def compose_noise_masks(cached_pipe,
|
249 |
+
foreground_image: Image,
|
250 |
+
background_image: Image,
|
251 |
+
target_mask: torch.Tensor,
|
252 |
+
foreground_mask: torch.Tensor,
|
253 |
+
option: str = "bg", # bg, bg_fg, segmentation1, tf_icon
|
254 |
+
photoshop_fg_noise: bool = False,
|
255 |
+
num_inversion_steps: int = 100,
|
256 |
+
):
|
257 |
+
|
258 |
+
"""
|
259 |
+
Composes noise masks for image generation using different strategies.
|
260 |
+
This function composes noise masks for stable diffusion inversion, with several composition strategies:
|
261 |
+
- "bg": Uses only background noise
|
262 |
+
- "bg_fg": Combines background and foreground noise using a target mask
|
263 |
+
- "segmentation1": Uses segmentation mask to compose foreground and background noise
|
264 |
+
- "segmentation2": Implements advanced composition with additional boundary noise
|
265 |
+
Parameters:
|
266 |
+
----------
|
267 |
+
cached_pipe : object
|
268 |
+
The cached stable diffusion pipeline used for noise inversion
|
269 |
+
foreground_image : PIL.Image
|
270 |
+
The foreground image to be placed in the background
|
271 |
+
background_image : PIL.Image
|
272 |
+
The background image
|
273 |
+
target_mask : torch.Tensor
|
274 |
+
Target mask indicating the position where the foreground should be placed
|
275 |
+
foreground_mask : torch.Tensor
|
276 |
+
Segmentation mask of the foreground object
|
277 |
+
option : str, default="bg"
|
278 |
+
Composition strategy: "bg", "bg_fg", "segmentation1", or "segmentation2"
|
279 |
+
photoshop_fg_noise : bool, default=False
|
280 |
+
Whether to generate noise from a photoshopped composition of foreground and background
|
281 |
+
num_inversion_steps : int, default=100
|
282 |
+
Number of steps for the inversion process
|
283 |
+
Returns:
|
284 |
+
-------
|
285 |
+
dict
|
286 |
+
A dictionary containing:
|
287 |
+
- "noise": Dictionary of generated noises (composed_noise, foreground_noise, background_noise)
|
288 |
+
- "latent_masks": Dictionary of latent masks used for composition
|
289 |
+
"""
|
290 |
+
|
291 |
+
# assert options
|
292 |
+
assert option in ["bg", "bg_fg", "segmentation1", "segmentation2"], f"Invalid option: {option}"
|
293 |
+
|
294 |
+
# calculate size of latent noise for mask resizing
|
295 |
+
PATCH_SIZE = 16
|
296 |
+
latent_size = background_image.size[0] // PATCH_SIZE
|
297 |
+
latents = (latent_size, latent_size)
|
298 |
+
|
299 |
+
# process the options
|
300 |
+
if option == "bg":
|
301 |
+
# only background noise
|
302 |
+
bg_noise = get_inverted_input_noise(cached_pipe, background_image, num_steps=num_inversion_steps)
|
303 |
+
composed_noise = bg_noise
|
304 |
+
|
305 |
+
all_noise = {
|
306 |
+
"composed_noise": composed_noise,
|
307 |
+
"background_noise": bg_noise,
|
308 |
+
}
|
309 |
+
all_latent_masks = {}
|
310 |
+
|
311 |
+
|
312 |
+
elif option == "bg_fg":
|
313 |
+
|
314 |
+
# resize and scale the image to the bounding box
|
315 |
+
reframed_fg_img, resized_mask = place_image_in_bounding_box(
|
316 |
+
torch.from_numpy(np.array(foreground_image)),
|
317 |
+
(torch.from_numpy(np.array(target_mask)) / 255.0).to(dtype=bool)
|
318 |
+
)
|
319 |
+
|
320 |
+
#print("Placed Foreground Image")
|
321 |
+
reframed_fg_img = Image.fromarray(reframed_fg_img.numpy())
|
322 |
+
#display(reframed_fg_img)
|
323 |
+
|
324 |
+
#print("Placed Mask")
|
325 |
+
resized_mask_img = Image.fromarray((resized_mask.numpy() * 255).astype(np.uint8))
|
326 |
+
#display(resized_mask_img)
|
327 |
+
|
328 |
+
# invert resized & padded image
|
329 |
+
if photoshop_fg_noise:
|
330 |
+
#print("Photoshopping FG IMAGE")
|
331 |
+
photoshop_img = Image.fromarray(
|
332 |
+
(torch.tensor(np.array(background_image)) * ~resized_mask.cpu().unsqueeze(-1) + torch.tensor(np.array(reframed_fg_img)) * resized_mask.cpu().unsqueeze(-1)).numpy()
|
333 |
+
)
|
334 |
+
#display(photoshop_img)
|
335 |
+
fg_noise = get_inverted_input_noise(cached_pipe, photoshop_img, num_steps=num_inversion_steps)
|
336 |
+
else:
|
337 |
+
fg_noise = get_inverted_input_noise(cached_pipe, reframed_fg_img, num_steps=num_inversion_steps)
|
338 |
+
bg_noise = get_inverted_input_noise(cached_pipe, background_image, num_steps=num_inversion_steps)
|
339 |
+
|
340 |
+
# overwrite get masked in latent space
|
341 |
+
latent_mask = resize_bounding_box(
|
342 |
+
resized_mask,
|
343 |
+
target_size=latents,
|
344 |
+
).flatten().unsqueeze(-1).to("cuda")
|
345 |
+
|
346 |
+
# compose the noise
|
347 |
+
composed_noise = bg_noise * (~latent_mask) + fg_noise * latent_mask
|
348 |
+
all_latent_masks = {
|
349 |
+
"latent_mask": latent_mask,
|
350 |
+
}
|
351 |
+
all_noise = {
|
352 |
+
"composed_noise": composed_noise,
|
353 |
+
"foreground_noise": fg_noise,
|
354 |
+
"background_noise": bg_noise,
|
355 |
+
}
|
356 |
+
|
357 |
+
elif option == "segmentation1":
|
358 |
+
# cut out the object and compose it with the background noise
|
359 |
+
|
360 |
+
# segmented foreground image
|
361 |
+
segmented_fg_image = torch.tensor(
|
362 |
+
np.array(
|
363 |
+
foreground_mask.resize(foreground_image.size)
|
364 |
+
)).to(torch.bool).unsqueeze(-1) * torch.tensor(
|
365 |
+
np.array(foreground_image)
|
366 |
+
)
|
367 |
+
|
368 |
+
# resize and scale the image to the bounding box
|
369 |
+
reframed_fg_img, resized_mask = place_image_in_bounding_box(
|
370 |
+
segmented_fg_image,
|
371 |
+
(torch.from_numpy(np.array(target_mask)) / 255.0).to(dtype=bool)
|
372 |
+
)
|
373 |
+
|
374 |
+
reframed_fg_img = Image.fromarray(reframed_fg_img.numpy())
|
375 |
+
#display(reframed_fg_img)
|
376 |
+
|
377 |
+
resized_mask_img = Image.fromarray((resized_mask.numpy() * 255).astype(np.uint8))
|
378 |
+
|
379 |
+
# resize and scale the mask itself
|
380 |
+
foreground_mask = foreground_mask.convert("RGB") # 3-channel mask so place_image_in_bounding_box receives the expected [W, H, C] layout
|
381 |
+
reframed_segmentation_mask, resized_mask = place_image_in_bounding_box(
|
382 |
+
torch.from_numpy(np.array(foreground_mask)),
|
383 |
+
(torch.from_numpy(np.array(target_mask)) / 255.0).to(dtype=bool)
|
384 |
+
)
|
385 |
+
|
386 |
+
reframed_segmentation_mask = reframed_segmentation_mask.numpy()
|
387 |
+
reframed_segmentation_mask_img = Image.fromarray(reframed_segmentation_mask)
|
388 |
+
#print("Placed Segmentation Mask")
|
389 |
+
#display(reframed_segmentation_mask_img)
|
390 |
+
|
391 |
+
# invert resized & padded image
|
392 |
+
# fg_noise = get_inverted_input_noise(cached_pipe, reframed_fg_img, num_steps=num_inversion_steps)
|
393 |
+
|
394 |
+
if photoshop_fg_noise:
|
395 |
+
# temporarily convert to apply mask
|
396 |
+
#print("Photoshopping FG IMAGE")
|
397 |
+
seg_mask_temp = torch.from_numpy(reframed_segmentation_mask).bool()
|
398 |
+
bg_temp = torch.tensor(np.array(background_image))
|
399 |
+
fg_temp = torch.tensor(np.array(reframed_fg_img))
|
400 |
+
|
401 |
+
photoshop_img = Image.fromarray(
|
402 |
+
(bg_temp * (~seg_mask_temp) + fg_temp * seg_mask_temp).numpy()
|
403 |
+
).convert("RGB")
|
404 |
+
#display(photoshop_img)
|
405 |
+
fg_noise = get_inverted_input_noise(cached_pipe, photoshop_img, num_steps=num_inversion_steps)
|
406 |
+
else:
|
407 |
+
fg_noise = get_inverted_input_noise(cached_pipe, reframed_fg_img, num_steps=num_inversion_steps)
|
408 |
+
|
409 |
+
|
410 |
+
bg_noise = get_inverted_input_noise(cached_pipe, background_image, num_steps=num_inversion_steps)
|
411 |
+
bg_noise_init = bg_noise[-1].squeeze(0) if isinstance(bg_noise, list) else bg_noise
|
412 |
+
fg_noise_init = fg_noise[-1].squeeze(0) if isinstance(fg_noise, list) else fg_noise
|
413 |
+
|
414 |
+
# overwrite background in resized mask
|
415 |
+
# convert mask from 512x512x3 to 512x512 first
|
416 |
+
reframed_segmentation_mask = reframed_segmentation_mask[:, :, 0]
|
417 |
+
reframed_segmentation_mask = torch.from_numpy(reframed_segmentation_mask).to(dtype=bool)
|
418 |
+
latent_mask = resize_bounding_box(
|
419 |
+
reframed_segmentation_mask,
|
420 |
+
target_size=latents,
|
421 |
+
).flatten().unsqueeze(-1).to("cuda")
|
422 |
+
bb_mask = resize_bounding_box(
|
423 |
+
resized_mask,
|
424 |
+
target_size=latents,
|
425 |
+
).flatten().unsqueeze(-1).to("cuda")
|
426 |
+
|
427 |
+
# compose noise
|
428 |
+
composed_noise = bg_noise_init * (~latent_mask) + fg_noise_init * latent_mask
|
429 |
+
|
430 |
+
all_latent_masks = {
|
431 |
+
"latent_segmentation_mask": latent_mask,
|
432 |
+
# FIXME: handle bounding box better (making sure shapes are correct, especially when bg and fg images have different sizes, e.g. test image 69)
|
433 |
+
"bb_mask": bb_mask,
|
434 |
+
}
|
435 |
+
all_noise = {
|
436 |
+
"composed_noise": composed_noise,
|
437 |
+
"foreground_noise": fg_noise_init,
|
438 |
+
"background_noise": bg_noise_init,
|
439 |
+
"foreground_noise_list": fg_noise if isinstance(fg_noise, list) else None,
|
440 |
+
"background_noise_list": bg_noise if isinstance(bg_noise, list) else None,
|
441 |
+
}
|
442 |
+
|
443 |
+
|
444 |
+
elif option == "segmentation2":
|
445 |
+
# add random noise in the background
|
446 |
+
|
447 |
+
# segmented foreground image
|
448 |
+
segmented_fg_image = torch.tensor(
|
449 |
+
np.array(
|
450 |
+
foreground_mask.resize(foreground_image.size)
|
451 |
+
)).to(torch.bool).unsqueeze(-1) * torch.tensor(
|
452 |
+
np.array(foreground_image)
|
453 |
+
)
|
454 |
+
|
455 |
+
# resize and scale the image to the bounding box
|
456 |
+
reframed_fg_img, resized_mask = place_image_in_bounding_box(
|
457 |
+
segmented_fg_image,
|
458 |
+
(torch.from_numpy(np.array(target_mask)) / 255.0).to(dtype=bool)
|
459 |
+
)
|
460 |
+
|
461 |
+
#print("Segmented and Placed FG Image")
|
462 |
+
reframed_fg_img = Image.fromarray(reframed_fg_img.numpy())
|
463 |
+
#display(reframed_fg_img)
|
464 |
+
|
465 |
+
# resize and scale the mask itself
|
466 |
+
foreground_mask = foreground_mask.convert("RGB")
|
467 |
+
reframed_segmentation_mask, resized_mask = place_image_in_bounding_box(
|
468 |
+
torch.from_numpy(np.array(foreground_mask)),
|
469 |
+
(torch.from_numpy(np.array(target_mask)) / 255.0).to(dtype=bool)
|
470 |
+
)
|
471 |
+
|
472 |
+
reframed_segmentation_mask = reframed_segmentation_mask.numpy()
|
473 |
+
reframed_segmentation_mask_img = Image.fromarray(reframed_segmentation_mask)
|
474 |
+
#print("Reframed Segmentation Mask")
|
475 |
+
#display(reframed_segmentation_mask_img)
|
476 |
+
|
477 |
+
xor_mask = target_mask ^ np.array(reframed_segmentation_mask_img.convert("L"))
|
478 |
+
#print("XOR Mask")
|
479 |
+
#display(Image.fromarray(xor_mask))
|
480 |
+
|
481 |
+
# invert resized & padded image
|
482 |
+
# fg_noise = get_inverted_input_noise(cached_pipe, reframed_fg_img, num_steps=num_inversion_steps)
|
483 |
+
if photoshop_fg_noise:
|
484 |
+
#print("Photoshopping FG IMAGE")
|
485 |
+
# temporarily convert to apply mask
|
486 |
+
seg_mask_temp = torch.from_numpy(reframed_segmentation_mask).bool()
|
487 |
+
bg_temp = torch.tensor(np.array(background_image))
|
488 |
+
fg_temp = torch.tensor(np.array(reframed_fg_img))
|
489 |
+
|
490 |
+
photoshop_img = Image.fromarray(
|
491 |
+
(bg_temp * (~seg_mask_temp) + fg_temp * seg_mask_temp).numpy()
|
492 |
+
).convert("RGB")
|
493 |
+
#display(photoshop_img)
|
494 |
+
fg_noise = get_inverted_input_noise(cached_pipe, photoshop_img, num_steps=num_inversion_steps)
|
495 |
+
else:
|
496 |
+
fg_noise = get_inverted_input_noise(cached_pipe, reframed_fg_img, num_steps=num_inversion_steps)
|
497 |
+
bg_noise = get_inverted_input_noise(cached_pipe, background_image, num_steps=num_inversion_steps)
|
498 |
+
|
499 |
+
# overwrite background in resized mask
|
500 |
+
# convert mask from 512x512x3 to 512x512
|
501 |
+
reframed_segmentation_mask = reframed_segmentation_mask[:, :, 0]
|
502 |
+
reframed_segmentation_mask = torch.from_numpy(reframed_segmentation_mask).to(dtype=bool)
|
503 |
+
|
504 |
+
# get all masks in latents and move to device
|
505 |
+
latent_seg_mask = resize_bounding_box(
|
506 |
+
reframed_segmentation_mask,
|
507 |
+
target_size=latents,
|
508 |
+
).flatten().unsqueeze(-1).to("cuda")
|
509 |
+
print(latent_seg_mask.shape)
|
510 |
+
|
511 |
+
|
512 |
+
latent_xor_mask = resize_bounding_box(
|
513 |
+
torch.from_numpy(xor_mask),
|
514 |
+
target_size=latents,
|
515 |
+
).flatten().unsqueeze(-1).to("cuda")
|
516 |
+
|
517 |
+
|
518 |
+
print(resized_mask.shape)
|
519 |
+
latent_target_mask = resize_bounding_box(
|
520 |
+
resized_mask,
|
521 |
+
target_size=latents,
|
522 |
+
).flatten().unsqueeze(-1).to("cuda")
|
523 |
+
|
524 |
+
# implement x∗T = xrT ⊙Mseg +xmT ⊙(1−Muser)+z⊙(Muser ⊕Mseg)
|
525 |
+
bg_noise_init = bg_noise[-1].squeeze(0) if isinstance(bg_noise, list) else bg_noise
|
526 |
+
fg_noise_init = fg_noise[-1].squeeze(0) if isinstance(fg_noise, list) else fg_noise
|
527 |
+
|
528 |
+
bg = bg_noise_init[-1] * (~latent_target_mask)
|
529 |
+
fg = fg_noise_init[-1] * latent_seg_mask
|
530 |
+
boundary = latent_xor_mask * torch.randn(latent_xor_mask.shape).to("cuda")
|
531 |
+
composed_noise = bg + fg + boundary
|
532 |
+
|
533 |
+
all_latent_masks = {
|
534 |
+
"latent_target_mask": latent_target_mask,
|
535 |
+
"latent_segmentation_mask": latent_seg_mask,
|
536 |
+
"latent_xor_mask": latent_xor_mask,
|
537 |
+
}
|
538 |
+
all_noise = {
|
539 |
+
"composed_noise": composed_noise,
|
540 |
+
"foreground_noise": fg_noise_init,
|
541 |
+
"background_noise": bg_noise_init,
|
542 |
+
"foreground_noise_list": fg_noise if isinstance(fg_noise, list) else None,
|
543 |
+
"background_noise_list": bg_noise if isinstance(bg_noise, list) else None,
|
544 |
+
}
|
545 |
+
|
546 |
+
# always add latent bbox mask (for bg consistency or any other future application)
|
547 |
+
latent_bbox_mask = resize_bounding_box(
|
548 |
+
torch.from_numpy(np.array(target_mask.resize(background_image.size))), # resize just to be sure sizes match
|
549 |
+
target_size=latents,
|
550 |
+
).flatten().unsqueeze(-1).to("cuda")
|
551 |
+
all_latent_masks["latent_bbox_mask"] = latent_bbox_mask
|
552 |
+
|
553 |
+
# always add latent segmentation mask
|
554 |
+
reframed_fg_img, resized_mask = place_image_in_bounding_box(
|
555 |
+
torch.from_numpy(np.array(foreground_image)),
|
556 |
+
(torch.from_numpy(np.array(target_mask)) / 255.0).to(dtype=bool)
|
557 |
+
)
|
558 |
+
bb_mask = resize_bounding_box(
|
559 |
+
resized_mask,
|
560 |
+
target_size=latents,
|
561 |
+
).flatten().unsqueeze(-1).to("cuda")
|
562 |
+
all_latent_masks["latent_segmentation_mask"] = bb_mask
|
563 |
+
|
564 |
+
# output
|
565 |
+
return {
|
566 |
+
"noise": all_noise,
|
567 |
+
"latent_masks": all_latent_masks,
|
568 |
+
}
|
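An illustrative call into the composition entry point above; the image paths, the `cached_pipe` object and the chosen option are placeholders for illustration, not defaults shipped with the repository:

    from PIL import Image

    fg = Image.open("foreground.png").convert("RGB")
    bg = Image.open("background.png").convert("RGB")
    bbox_mask = Image.open("target_bbox_mask.png").convert("L")      # 0/255 rectangle marking the target region
    seg_mask = Image.open("foreground_seg_mask.png").convert("L")    # 0/255 segmentation of the foreground object

    result = compose_noise_masks(
        cached_pipe,                  # a CachedPipeline wrapping the Flux pipeline
        foreground_image=fg,
        background_image=bg,
        target_mask=bbox_mask,
        foreground_mask=seg_mask,
        option="segmentation1",
        num_inversion_steps=28,
    )
    composed_noise = result["noise"]["composed_noise"]   # starting latent for the composited generation
    latent_masks = result["latent_masks"]                # latent-space masks reusable for later edits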
SDLens/cache_and_edit/metrics.py
ADDED
@@ -0,0 +1,116 @@
|
1 |
+
import numpy as np
|
2 |
+
from PIL import Image
|
3 |
+
from typing import Union
|
4 |
+
import torch
|
5 |
+
from transformers import CLIPProcessor, CLIPModel
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from transformers import AutoModel, AutoImageProcessor
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
def masked_mse_tiled_mask(
|
12 |
+
image1_pil: Image.Image,
|
13 |
+
image2_pil: Image.Image,
|
14 |
+
tile_mask: Union[np.ndarray, torch.Tensor],
|
15 |
+
tile_size: int = 16
|
16 |
+
) -> float:
|
17 |
+
# Convert images to float32 numpy arrays, normalized [0, 1]
|
18 |
+
img1 = np.asarray(image1_pil).astype(np.float32) / 255.0
|
19 |
+
img2 = np.asarray(image2_pil).astype(np.float32) / 255.0
|
20 |
+
|
21 |
+
# Convert mask to numpy if it's a torch tensor
|
22 |
+
if isinstance(tile_mask, torch.Tensor):
|
23 |
+
tile_mask = tile_mask.detach().cpu().numpy()
|
24 |
+
|
25 |
+
tile_mask = tile_mask.astype(np.float32)
|
26 |
+
|
27 |
+
# Upsample mask using np.kron to match image resolution
|
28 |
+
upsampled_mask = np.expand_dims(np.kron(tile_mask, np.ones((tile_size, tile_size), dtype=np.float32)), axis=-1)
|
29 |
+
|
30 |
+
# Invert mask: 1 = exclude → 0; 0 = include → 1
|
31 |
+
include_mask = 1.0 - upsampled_mask
|
32 |
+
|
33 |
+
# Compute squared difference
|
34 |
+
diff_squared = (img1 - img2) ** 2
|
35 |
+
masked_diff = diff_squared * include_mask
|
36 |
+
|
37 |
+
# Sum and normalize by valid (included) pixels
|
38 |
+
valid_pixel_count = np.sum(include_mask)
|
39 |
+
if valid_pixel_count == 0:
|
40 |
+
raise ValueError("All pixels are masked out. Cannot compute MSE.")
|
41 |
+
|
42 |
+
mse = np.sum(masked_diff) / valid_pixel_count
|
43 |
+
return float(mse)
|
44 |
+
|
45 |
+
|
46 |
+
def compute_clip_similarity(image: Image.Image, prompt: str) -> float:
|
47 |
+
"""
|
48 |
+
Compute CLIP similarity between a PIL image and a text prompt.
|
49 |
+
Loads CLIP model only once and caches it.
|
50 |
+
|
51 |
+
Args:
|
52 |
+
image (PIL.Image.Image): Input image.
|
53 |
+
prompt (str): Text prompt.
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
float: Cosine similarity between image and text.
|
57 |
+
"""
|
58 |
+
if not hasattr(compute_clip_similarity, "model"):
|
59 |
+
compute_clip_similarity.model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
|
60 |
+
compute_clip_similarity.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
|
61 |
+
compute_clip_similarity.model.eval()
|
62 |
+
|
63 |
+
model = compute_clip_similarity.model
|
64 |
+
processor = compute_clip_similarity.processor
|
65 |
+
|
66 |
+
image = image.convert("RGB")
|
67 |
+
image_inputs = processor(images=image, return_tensors="pt")
|
68 |
+
text_inputs = processor(text=[prompt], return_tensors="pt")
|
69 |
+
|
70 |
+
|
71 |
+
with torch.no_grad():
|
72 |
+
image_features = model.get_image_features(**image_inputs)
|
73 |
+
text_features = model.get_text_features(**text_inputs)
|
74 |
+
|
75 |
+
image_features = F.normalize(image_features, p=2, dim=-1)
|
76 |
+
text_features = F.normalize(text_features, p=2, dim=-1)
|
77 |
+
|
78 |
+
similarity = (image_features @ text_features.T).item()
|
79 |
+
|
80 |
+
return similarity
|
81 |
+
|
82 |
+
|
83 |
+
def compute_dinov2_similarity(image1: Image.Image, image2: Image.Image) -> float:
|
84 |
+
"""
|
85 |
+
Compute perceptual similarity between two images using DINOv2 embeddings.
|
86 |
+
|
87 |
+
Args:
|
88 |
+
image1 (PIL.Image.Image): First image.
|
89 |
+
image2 (PIL.Image.Image): Second image.
|
90 |
+
|
91 |
+
Returns:
|
92 |
+
float: Cosine similarity between DINOv2 embeddings of the images.
|
93 |
+
"""
|
94 |
+
# Load model and processor only once
|
95 |
+
if not hasattr(compute_dinov2_similarity, "model"):
|
96 |
+
compute_dinov2_similarity.processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
|
97 |
+
compute_dinov2_similarity.model = AutoModel.from_pretrained("facebook/dinov2-base")
|
98 |
+
compute_dinov2_similarity.model.eval()
|
99 |
+
|
100 |
+
processor = compute_dinov2_similarity.processor
|
101 |
+
model = compute_dinov2_similarity.model
|
102 |
+
|
103 |
+
# Preprocess both images
|
104 |
+
inputs = processor(images=[image1.convert("RGB"), image2.convert("RGB")], return_tensors="pt")
|
105 |
+
|
106 |
+
with torch.no_grad():
|
107 |
+
outputs = model(**inputs)
|
108 |
+
features = outputs.last_hidden_state.mean(dim=1) # mean-pooled patch features
|
109 |
+
|
110 |
+
# Normalize
|
111 |
+
features = F.normalize(features, p=2, dim=-1)
|
112 |
+
|
113 |
+
# Cosine similarity
|
114 |
+
similarity = (features[0] @ features[1].T).item()
|
115 |
+
|
116 |
+
return similarity
|
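A hedged example combining the three metrics above; the file names, the prompt and the 512x512 image / 16-pixel-tile geometry are assumptions for illustration:

    import numpy as np
    from PIL import Image

    generated = Image.open("generated.png").convert("RGB")   # assumed 512x512
    reference = Image.open("reference.png").convert("RGB")   # assumed 512x512

    # A 32x32 tile mask with tile_size=16 covers a 512x512 image; 1 marks tiles to exclude from the MSE.
    tile_mask = np.zeros((32, 32), dtype=np.float32)
    tile_mask[8:24, 8:24] = 1.0   # ignore the edited centre region, score background preservation only

    bg_mse = masked_mse_tiled_mask(generated, reference, tile_mask, tile_size=16)
    clip_score = compute_clip_similarity(generated, "a cat sitting on a bench")
    dino_score = compute_dinov2_similarity(generated, reference)
    print(f"background MSE: {bg_mse:.4f}  CLIP: {clip_score:.3f}  DINOv2: {dino_score:.3f}")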
SDLens/cache_and_edit/qkv_cache.py
ADDED
@@ -0,0 +1,557 @@
|
1 |
+
# Add parent directory to sys.path
|
2 |
+
from collections import defaultdict
|
3 |
+
import gc
|
4 |
+
import os, sys
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
from SDLens.cache_and_edit.flux_pipeline import EditedFluxPipeline
|
8 |
+
parent_dir = Path.cwd().parent.resolve()
|
9 |
+
if str(parent_dir) not in sys.path:
|
10 |
+
sys.path.insert(0, str(parent_dir))
|
11 |
+
|
12 |
+
from typing import Dict, List, Literal, Optional, TypedDict, Type, Union
|
13 |
+
import torch
|
14 |
+
from diffusers.models.attention_processor import Attention
|
15 |
+
from diffusers.models.transformers import FluxTransformer2DModel
|
16 |
+
from diffusers import FluxPipeline
|
17 |
+
from diffusers.models.embeddings import apply_rotary_emb
|
18 |
+
from SDLens.cache_and_edit.hooks import locate_block
|
19 |
+
import torch.nn.functional as F
|
20 |
+
from diffusers.models.attention_processor import FluxAttnProcessor2_0
|
21 |
+
|
22 |
+
class QKVCache(TypedDict):
|
23 |
+
query: List[torch.Tensor]
|
24 |
+
key: List[torch.Tensor]
|
25 |
+
value: List[torch.Tensor]
|
26 |
+
|
27 |
+
|
28 |
+
class CachedFluxAttnProcessor2_0:
|
29 |
+
"""Attention processor used typically in processing the SD3-like self-attention projections."""
|
30 |
+
|
31 |
+
def __init__(self, external_cache: QKVCache,
|
32 |
+
inject_kv: Literal["image", "text", "both"]= None,
|
33 |
+
text_seq_length: int = 512):
|
34 |
+
"""Constructor for Cached attention processor.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
external_cache (QKVCache): cache to store/inject values.
|
38 |
+
inject_kv (Literal["image", "text", "both"], optional): whether to inject image, text or both streams KV.
|
39 |
+
If None, it does not perform injection but the full cache is stored. Defaults to None.
|
40 |
+
"""
|
41 |
+
|
42 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
43 |
+
raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
44 |
+
self.cache = external_cache
|
45 |
+
self.inject_kv = inject_kv
|
46 |
+
self.text_seq_length = text_seq_length
|
47 |
+
assert all((cache_key in external_cache) for cache_key in {"query", "key", "value"}), "Cache has to contain 'query', 'key' and 'value' keys."
|
48 |
+
|
49 |
+
def __call__(
|
50 |
+
self,
|
51 |
+
attn: Attention,
|
52 |
+
hidden_states: torch.FloatTensor,
|
53 |
+
encoder_hidden_states: torch.FloatTensor = None,
|
54 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
55 |
+
image_rotary_emb: Optional[torch.Tensor] = None,
|
56 |
+
) -> torch.FloatTensor:
|
57 |
+
batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
58 |
+
|
59 |
+
# `sample` projections.
|
60 |
+
query = attn.to_q(hidden_states)
|
61 |
+
key = attn.to_k(hidden_states)
|
62 |
+
value = attn.to_v(hidden_states)
|
63 |
+
|
64 |
+
inner_dim = key.shape[-1]
|
65 |
+
head_dim = inner_dim // attn.heads
|
66 |
+
|
67 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
68 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
69 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
70 |
+
|
71 |
+
if attn.norm_q is not None:
|
72 |
+
query = attn.norm_q(query)
|
73 |
+
if attn.norm_k is not None:
|
74 |
+
key = attn.norm_k(key)
|
75 |
+
|
76 |
+
# the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
|
77 |
+
if encoder_hidden_states is not None:
|
78 |
+
# `context` projections.
|
79 |
+
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
|
80 |
+
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
81 |
+
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
|
82 |
+
|
83 |
+
encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
|
84 |
+
batch_size, -1, attn.heads, head_dim
|
85 |
+
).transpose(1, 2)
|
86 |
+
encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
|
87 |
+
batch_size, -1, attn.heads, head_dim
|
88 |
+
).transpose(1, 2)
|
89 |
+
encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
|
90 |
+
batch_size, -1, attn.heads, head_dim
|
91 |
+
).transpose(1, 2)
|
92 |
+
|
93 |
+
if attn.norm_added_q is not None:
|
94 |
+
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
|
95 |
+
if attn.norm_added_k is not None:
|
96 |
+
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
|
97 |
+
|
98 |
+
# attention
|
99 |
+
query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
|
100 |
+
key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
|
101 |
+
value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
|
102 |
+
|
103 |
+
if image_rotary_emb is not None:
|
104 |
+
query = apply_rotary_emb(query, image_rotary_emb)
|
105 |
+
key = apply_rotary_emb(key, image_rotary_emb)
|
106 |
+
|
107 |
+
# Cache Q, K, V
|
108 |
+
if self.inject_kv == "image":
|
109 |
+
# NOTE: I am replacing key and values only for the image branch
|
110 |
+
# NOTE: in default settings, encoder_hidden_states_key_proj.shape[2] == 512
|
111 |
+
# the first element of the batch is the image whose key and value will be injected into all the other images
|
112 |
+
key[1:, :, self.text_seq_length:] = key[:1, :, self.text_seq_length:]
|
113 |
+
value[1:, :, self.text_seq_length:] = value[:1, :, self.text_seq_length:]
|
114 |
+
elif self.inject_kv == "text":
|
115 |
+
key[1:, :, :self.text_seq_length] = key[:1, :, :self.text_seq_length]
|
116 |
+
value[1:, :, :self.text_seq_length] = value[:1, :, :self.text_seq_length]
|
117 |
+
elif self.inject_kv == "both":
|
118 |
+
key[1:] = key[:1]
|
119 |
+
value[1:] = value[:1]
|
120 |
+
else: # Don't inject, store cache!
|
121 |
+
self.cache["query"].append(query)
|
122 |
+
self.cache["key"].append(key)
|
123 |
+
self.cache["value"].append(value)
|
124 |
+
|
125 |
+
hidden_states = F.scaled_dot_product_attention(
|
126 |
+
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
127 |
+
)
|
128 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
129 |
+
hidden_states = hidden_states.to(query.dtype)
|
130 |
+
|
131 |
+
|
132 |
+
if encoder_hidden_states is not None:
|
133 |
+
encoder_hidden_states, hidden_states = (
|
134 |
+
hidden_states[:, : encoder_hidden_states.shape[1]],
|
135 |
+
hidden_states[:, encoder_hidden_states.shape[1] :],
|
136 |
+
)
|
137 |
+
|
138 |
+
# linear proj
|
139 |
+
hidden_states = attn.to_out[0](hidden_states)
|
140 |
+
# dropout
|
141 |
+
hidden_states = attn.to_out[1](hidden_states)
|
142 |
+
|
143 |
+
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
144 |
+
|
145 |
+
return hidden_states, encoder_hidden_states
|
146 |
+
else:
|
147 |
+
return hidden_states
|
148 |
+
|
149 |
+
|
150 |
+
class CachedFluxAttnProcessor3_0:
|
151 |
+
"""Attention processor used typically in processing the SD3-like self-attention projections."""
|
152 |
+
|
153 |
+
def __init__(self, external_cache: QKVCache,
|
154 |
+
inject_kv: Literal["image", "text", "both"]= None,
|
155 |
+
inject_kv_foreground: bool = False,
|
156 |
+
text_seq_length: int = 512,
|
157 |
+
q_mask: Optional[torch.Tensor] = None,):
|
158 |
+
"""Constructor for Cached attention processor.
|
159 |
+
|
160 |
+
Args:
|
161 |
+
external_cache (QKVCache): cache to store/inject values.
|
162 |
+
inject_kv (Literal["image", "text", "both"], optional): whether to inject image, text or both streams KV.
|
163 |
+
If None, it does not perform injection but the full cache is stored. Defaults to None.
|
164 |
+
"""
|
165 |
+
|
166 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
167 |
+
raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
168 |
+
self.cache = external_cache
|
169 |
+
self.inject_kv = inject_kv
|
170 |
+
self.inject_kv_foreground = inject_kv_foreground
|
171 |
+
self.text_seq_length = text_seq_length
|
172 |
+
self.q_mask = q_mask
|
173 |
+
assert all((cache_key in external_cache) for cache_key in {"query", "key", "value"}), "Cache has to contain 'query', 'key' and 'value' keys."
|
174 |
+
|
175 |
+
def __call__(
|
176 |
+
self,
|
177 |
+
attn: Attention,
|
178 |
+
hidden_states: torch.FloatTensor,
|
179 |
+
encoder_hidden_states: torch.FloatTensor = None,
|
180 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
181 |
+
image_rotary_emb: Optional[torch.Tensor] = None,
|
182 |
+
) -> torch.FloatTensor:
|
183 |
+
batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
184 |
+
|
185 |
+
# `sample` projections.
|
186 |
+
query = attn.to_q(hidden_states)
|
187 |
+
key = attn.to_k(hidden_states)
|
188 |
+
value = attn.to_v(hidden_states)
|
189 |
+
|
190 |
+
inner_dim = key.shape[-1]
|
191 |
+
head_dim = inner_dim // attn.heads
|
192 |
+
|
193 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
194 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
195 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
196 |
+
|
197 |
+
if attn.norm_q is not None:
|
198 |
+
query = attn.norm_q(query)
|
199 |
+
if attn.norm_k is not None:
|
200 |
+
key = attn.norm_k(key)
|
201 |
+
|
202 |
+
# the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
|
203 |
+
if encoder_hidden_states is not None:
|
204 |
+
# `context` projections.
|
205 |
+
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
|
206 |
+
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
207 |
+
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
|
208 |
+
|
209 |
+
encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
|
210 |
+
batch_size, -1, attn.heads, head_dim
|
211 |
+
).transpose(1, 2)
|
212 |
+
encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
|
213 |
+
batch_size, -1, attn.heads, head_dim
|
214 |
+
).transpose(1, 2)
|
215 |
+
encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
|
216 |
+
batch_size, -1, attn.heads, head_dim
|
217 |
+
).transpose(1, 2)
|
218 |
+
|
219 |
+
if attn.norm_added_q is not None:
|
220 |
+
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
|
221 |
+
if attn.norm_added_k is not None:
|
222 |
+
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
|
223 |
+
|
224 |
+
# attention
|
225 |
+
query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
|
226 |
+
key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
|
227 |
+
value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
|
228 |
+
|
229 |
+
|
230 |
+
# # Cache Q, K, V
|
231 |
+
# if self.inject_kv == "image":
|
232 |
+
# # NOTE: I am replacing key and values only for the image branch
|
233 |
+
# # NOTE: in default settings, encoder_hidden_states_key_proh.shape[2] == 512
|
234 |
+
# # the first element of the batch is the image whose key and value will be injected into all the other images
|
235 |
+
# key[1:, :, self.text_seq_length:] = key[:1, :, self.text_seq_length:]
|
236 |
+
# value[1:, :, self.text_seq_length:] = value[:1, :, self.text_seq_length:]
|
237 |
+
# elif self.inject_kv == "text":
|
238 |
+
# key[1:, :, :self.text_seq_length] = key[:1, :, :self.text_seq_length]
|
239 |
+
# value[1:, :, :self.text_seq_length] = value[:1, :, :self.text_seq_length]
|
240 |
+
# elif self.inject_kv == "both":
|
241 |
+
# key[1:] = key[:1]
|
242 |
+
# value[1:] = value[:1]
|
243 |
+
# else: # Don't inject, store cache!
|
244 |
+
# self.cache["query"].append(query)
|
245 |
+
# self.cache["key"].append(key)
|
246 |
+
# self.cache["value"].append(value)
|
247 |
+
|
248 |
+
# extend the mask to match key and values dimension:
|
249 |
+
# Shape of mask is: (num_image_tokens, 1)
|
250 |
+
mask = self.q_mask.permute(1, 0).unsqueeze(0).unsqueeze(-1) # Shape: (1, 1, num_image_tokens, 1)
|
251 |
+
# put mask on gpu
|
252 |
+
mask = mask.to(key.device)
|
253 |
+
# first check that we inject only kv in images:
|
254 |
+
if self.inject_kv is not None and self.inject_kv != "image":
|
255 |
+
raise NotImplementedError("Injecting is implemented only for images.")
|
256 |
+
# the second dimension of key/value is the number of heads
|
257 |
+
# The first element of the batch represents the background image, the second element of the batch
|
258 |
+
# represents the foreground image. The third element represents the image where we want to inject
|
259 |
+
# the key and value of the background image and foreground image according to the query mask.
|
260 |
+
# Inject from background (element 0) where mask is True
|
261 |
+
|
262 |
+
if image_rotary_emb is not None:
|
263 |
+
query = apply_rotary_emb(query, image_rotary_emb)
|
264 |
+
key = apply_rotary_emb(key, image_rotary_emb)
|
265 |
+
|
266 |
+
# Get the index range after the text tokens
|
267 |
+
start_idx = self.text_seq_length
|
268 |
+
|
269 |
+
if self.inject_kv_foreground and self.inject_kv == "image":
|
270 |
+
key[2:, :, start_idx:] = torch.where(mask, key[1:2, :, start_idx:], key[:1, :, start_idx:])
|
271 |
+
value[2:, :, start_idx:] = torch.where(mask, value[1:2, :, start_idx:], value[:1, :, start_idx:])
|
272 |
+
elif self.inject_kv == "image" and not self.inject_kv_foreground:
|
273 |
+
key[2:, :, start_idx:] = torch.where(mask, key[2:, :, start_idx:], key[:1, :, start_idx:])
|
274 |
+
value[2:, :, start_idx:] = torch.where(mask, value[2:, :, start_idx:], value[:1, :, start_idx:])
|
275 |
+
elif self.inject_kv is None and self.inject_kv_foreground:
|
276 |
+
key[2:, :, start_idx:] = torch.where(mask, key[1:2, :, start_idx:], key[2:, :, start_idx:])
|
277 |
+
value[2:, :, start_idx:] = torch.where(mask, value[1:2, :, start_idx:], value[2:, :, start_idx:])
|
278 |
+
|
279 |
+
hidden_states = F.scaled_dot_product_attention(
|
280 |
+
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
281 |
+
)
|
282 |
+
# mask hidden states from bg:
|
283 |
+
# hidden_states = hidden_states_fg[:, :, start_idx:] * mask + hidden_states_bg[:, :, start_idx:] * (~mask)
|
284 |
+
|
285 |
+
# concatenate the text
|
286 |
+
#hidden_states = torch.cat([hidden_states_bg[:, :, :start_idx], hidden_states], dim=2)
|
287 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
288 |
+
hidden_states = hidden_states.to(query.dtype)
|
289 |
+
|
290 |
+
|
291 |
+
if encoder_hidden_states is not None:
|
292 |
+
encoder_hidden_states, hidden_states = (
|
293 |
+
hidden_states[:, : encoder_hidden_states.shape[1]],
|
294 |
+
hidden_states[:, encoder_hidden_states.shape[1] :],
|
295 |
+
)
|
296 |
+
|
297 |
+
# linear proj
|
298 |
+
hidden_states = attn.to_out[0](hidden_states)
|
299 |
+
# dropout
|
300 |
+
hidden_states = attn.to_out[1](hidden_states)
|
301 |
+
|
302 |
+
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
303 |
+
|
304 |
+
return hidden_states, encoder_hidden_states
|
305 |
+
else:
|
306 |
+
return hidden_states
|
307 |
+
|
308 |
+
|
309 |
+
class QKVCacheFluxHandler:
|
310 |
+
"""Used to cache queries, keys and values of a FluxPipeline.
|
311 |
+
"""
|
312 |
+
|
313 |
+
def __init__(self, pipe: Union[FluxPipeline, EditedFluxPipeline],
|
314 |
+
positions_to_cache: List[str] = None,
|
315 |
+
positions_to_cache_foreground: List[str] = None,
|
316 |
+
inject_kv: Literal["image", "text", "both"] = None,
|
317 |
+
text_seq_length: int = 512,
|
318 |
+
q_mask: Optional[torch.Tensor] = None,
|
319 |
+
processor_class: Optional[Type] = CachedFluxAttnProcessor3_0
|
320 |
+
):
|
321 |
+
|
322 |
+
print(type(pipe))
|
323 |
+
if not isinstance(pipe, FluxPipeline) and not isinstance(pipe, EditedFluxPipeline):
|
324 |
+
raise NotImplementedError(f"QKVCache not yet implemented for {type(pipe)}.")
|
325 |
+
|
326 |
+
self.pipe = pipe
|
327 |
+
|
328 |
+
if positions_to_cache is not None:
|
329 |
+
self.positions_to_cache = positions_to_cache
|
330 |
+
else:
|
331 |
+
# act on all transformer layers
|
332 |
+
self.positions_to_cache = []
|
333 |
+
|
334 |
+
if positions_to_cache_foreground is not None:
|
335 |
+
self.positions_to_cache_foreground = positions_to_cache_foreground
|
336 |
+
else:
|
337 |
+
self.positions_to_cache_foreground = []
|
338 |
+
|
339 |
+
self._cache = {"query": [], "key": [], "value": []}
|
340 |
+
|
341 |
+
# Set Cached Processor to perform editing
|
342 |
+
|
343 |
+
all_layers = [f"transformer.transformer_blocks.{i}" for i in range(19)] + \
|
344 |
+
[f"transformer.single_transformer_blocks.{i}" for i in range(38)]
|
345 |
+
for module_name in all_layers:
|
346 |
+
|
347 |
+
inject_kv = "image" if module_name in self.positions_to_cache else None
|
348 |
+
inject_kv_foreground = module_name in self.positions_to_cache_foreground
|
349 |
+
|
350 |
+
|
351 |
+
module = locate_block(pipe, module_name)
|
352 |
+
module.attn.set_processor(processor_class(external_cache=self._cache,
|
353 |
+
inject_kv=inject_kv,
|
354 |
+
inject_kv_foreground=inject_kv_foreground,
|
355 |
+
text_seq_length=text_seq_length,
|
356 |
+
q_mask=q_mask,
|
357 |
+
))
|
358 |
+
|
359 |
+
|
360 |
+
@property
|
361 |
+
def cache(self) -> QKVCache:
|
362 |
+
"""Returns a dictionary initialized as {"query": [], "key": [], "value": []}.
|
363 |
+
After calling a forward pass for pipe, queries, keys and values will be
|
364 |
+
appended in the respective list for each layer.
|
365 |
+
|
366 |
+
Returns:
|
367 |
+
Dict[str, List[torch.Tensor]]: cache dictionary containing 'query', 'key' and 'value'
|
368 |
+
"""
|
369 |
+
return self._cache
|
370 |
+
|
371 |
+
def clear_cache(self) -> None:
|
372 |
+
# TODO: check if we have to force clean GPU memory
|
373 |
+
del(self._cache)
|
374 |
+
gc.collect() # force Python to clean up unreachable objects
|
375 |
+
torch.cuda.empty_cache() # tell PyTorch to release unused GPU memory from its cache
|
376 |
+
self._cache = {"query": [], "key": [], "value": []}
|
377 |
+
|
378 |
+
for module_name in self.positions_to_cache:
|
379 |
+
module = locate_block(self.pipe, module_name)
|
380 |
+
module.attn.set_processor(FluxAttnProcessor2_0())
|
381 |
+
|
382 |
+
|
383 |
+
class TFICONAttnProcessor:
|
384 |
+
"""Attention processor used typically in processing the SD3-like self-attention projections."""
|
385 |
+
|
386 |
+
def __init__(self,
|
387 |
+
external_cache: QKVCache,
|
388 |
+
inject_kv: Literal["image", "text", "both"]= None,
|
389 |
+
inject_kv_foreground: bool = False,
|
390 |
+
text_seq_length: int = 512,
|
391 |
+
q_mask: Optional[torch.Tensor] = None,
|
392 |
+
call_max_times = None,
|
393 |
+
inject_q = True,
|
394 |
+
inject_k = True,
|
395 |
+
inject_v = True,
|
396 |
+
):
|
397 |
+
"""Constructor for Cached attention processor.
|
398 |
+
|
399 |
+
Args:
|
400 |
+
external_cache (QKVCache): cache to store/inject values.
|
401 |
+
inject_kv (Literal["image", "text", "both"], optional): whether to inject image, text or both streams KV.
|
402 |
+
If None, it does not perform injection but the full cache is stored. Defaults to None.
|
403 |
+
"""
|
404 |
+
|
405 |
+
if not hasattr(F, "scaled_dot_product_attention"):
|
406 |
+
raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
|
407 |
+
self.cache = external_cache
|
408 |
+
self.inject_kv = inject_kv
|
409 |
+
self.inject_kv_foreground = inject_kv_foreground
|
410 |
+
self.text_seq_length = text_seq_length
|
411 |
+
self.q_mask = q_mask
|
412 |
+
self.inject_q = inject_q
|
413 |
+
self.inject_k = inject_k
|
414 |
+
self.inject_v = inject_v
|
415 |
+
|
416 |
+
self.call_max_times = call_max_times
|
417 |
+
if self.call_max_times is not None:
|
418 |
+
self.num_calls = call_max_times
|
419 |
+
else:
|
420 |
+
self.num_calls = None
|
421 |
+
assert all((cache_key in external_cache) for cache_key in {"query", "key", "value"}), "Cache has to contain 'query', 'key' and 'value' keys."
|
422 |
+
|
423 |
+
def __call__(
|
424 |
+
self,
|
425 |
+
attn: Attention,
|
426 |
+
hidden_states: torch.FloatTensor,
|
427 |
+
encoder_hidden_states: torch.FloatTensor = None,
|
428 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
429 |
+
image_rotary_emb: Optional[torch.Tensor] = None,
|
430 |
+
) -> torch.FloatTensor:
|
431 |
+
batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
432 |
+
|
433 |
+
# `sample` projections.
|
434 |
+
query = attn.to_q(hidden_states)
|
435 |
+
key = attn.to_k(hidden_states)
|
436 |
+
value = attn.to_v(hidden_states)
|
437 |
+
|
438 |
+
inner_dim = key.shape[-1]
|
439 |
+
head_dim = inner_dim // attn.heads
|
440 |
+
|
441 |
+
query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
442 |
+
key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
443 |
+
value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
|
444 |
+
|
445 |
+
if attn.norm_q is not None:
|
446 |
+
query = attn.norm_q(query)
|
447 |
+
if attn.norm_k is not None:
|
448 |
+
key = attn.norm_k(key)
|
449 |
+
|
450 |
+
# hidden states are the image patches (B, 4096, hidden_dim)
|
451 |
+
|
452 |
+
# encoder_hidden_states are the text tokens (B, 512, hidden_dim)
|
453 |
+
|
454 |
+
# the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
|
455 |
+
if encoder_hidden_states is not None:
|
456 |
+
# `context` projections.
|
457 |
+
encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
|
458 |
+
encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
|
459 |
+
encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
|
460 |
+
|
461 |
+
encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
|
462 |
+
batch_size, -1, attn.heads, head_dim
|
463 |
+
).transpose(1, 2)
|
464 |
+
encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
|
465 |
+
batch_size, -1, attn.heads, head_dim
|
466 |
+
).transpose(1, 2)
|
467 |
+
encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
|
468 |
+
batch_size, -1, attn.heads, head_dim
|
469 |
+
).transpose(1, 2)
|
470 |
+
|
471 |
+
if attn.norm_added_q is not None:
|
472 |
+
encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
|
473 |
+
if attn.norm_added_k is not None:
|
474 |
+
encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
|
475 |
+
|
476 |
+
# concat inputs for attention -> (B, num_heads, 512 + 4096, head_dim)
|
477 |
+
query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
|
478 |
+
key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
|
479 |
+
value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
|
480 |
+
|
481 |
+
# TODO: try first without mask
|
482 |
+
# Cache Q, K, V
|
483 |
+
# extend the mask to match key and values dimension:
|
484 |
+
# Shape of mask is: (num_image_tokens, 1)
|
485 |
+
mask = self.q_mask.permute(1, 0).unsqueeze(0).unsqueeze(-1)  # Shape: (1, 1, num_image_tokens, 1), broadcastable over (batch, heads, tokens, head_dim)
|
486 |
+
# put mask on gpu
|
487 |
+
mask = mask.to(key.device)
|
488 |
+
# first check that we inject only kv in images:
|
489 |
+
if self.inject_kv is not None and self.inject_kv != "image":
|
490 |
+
raise NotImplementedError("Injecting is implemented only for images.")
|
491 |
+
# the second dimension of the Q/K/V tensors is the number of heads
|
492 |
+
# The first element of the batch represents the background image, the second element of the batch
|
493 |
+
# represents the foreground image. The third element represents the image where we want to inject
|
494 |
+
# the key and value of the background image and foreground image according to the query mask.
|
495 |
+
# Inject from background (element 0) where mask is True
|
496 |
+
|
497 |
+
if image_rotary_emb is not None:
|
498 |
+
query = apply_rotary_emb(query, image_rotary_emb)
|
499 |
+
key = apply_rotary_emb(key, image_rotary_emb)
|
500 |
+
|
501 |
+
# Get the index range after the text tokens
|
502 |
+
start_idx = self.text_seq_length
|
503 |
+
|
504 |
+
# The batch is formed as follows:
|
505 |
+
# - background image (0)
|
506 |
+
# - foreground image (1)
|
507 |
+
# - composition(s) (2, 3, ...)
|
508 |
+
# Create the combined Q/K by forming Q_comp and K_comp: take the Q and K of the background image
|
509 |
+
# outside of the mask, and those of the foreground image inside the mask
|
510 |
+
|
511 |
+
if self.num_calls is None or self.num_calls > 0:
|
512 |
+
if self.inject_kv_foreground:
|
513 |
+
if self.inject_k:
|
514 |
+
key[2:, :, start_idx:] = torch.where(mask, key[1:2, :, start_idx:], key[0:1, :, start_idx:])
|
515 |
+
if self.inject_q:
|
516 |
+
query[2:, :, start_idx:] = torch.where(mask, query[1:2, :, start_idx:], query[0:1, :, start_idx:])
|
517 |
+
if self.inject_v:
|
518 |
+
value[2:, :, start_idx:] = torch.where(mask, value[1:2, :, start_idx:], value[0:1, :, start_idx:])
|
519 |
+
else:
|
520 |
+
if self.inject_k:
|
521 |
+
key[2:, :, start_idx:] = torch.where(mask, key[2:, :, start_idx:], key[0:1, :, start_idx:])
|
522 |
+
if self.inject_q:
|
523 |
+
query[2:, :, start_idx:] = torch.where(mask, query[2:, :, start_idx:], query[0:1, :, start_idx:])
|
524 |
+
if self.inject_v:
|
525 |
+
value[2:, :, start_idx:] = torch.where(mask, value[2:, :, start_idx:], value[0:1, :, start_idx:])
|
526 |
+
|
527 |
+
if self.num_calls is not None:
|
528 |
+
self.num_calls -= 1
|
529 |
+
|
530 |
+
|
531 |
+
# Use the combined attention map to compute attention using V from the composition image
|
532 |
+
hidden_states = F.scaled_dot_product_attention(
|
533 |
+
query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
|
534 |
+
)
|
535 |
+
|
536 |
+
# hidden_states[2:, :, start_idx:] = torch.where(mask, weightage * hidden_states[1:2, :, start_idx:] + (1-weightage) * hidden_states[2:, :, start_idx:], hidden_states[2:, :, start_idx:])
|
537 |
+
|
538 |
+
# merge the attention heads back; the text and image streams are split again below
|
539 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
|
540 |
+
hidden_states = hidden_states.to(query.dtype)
|
541 |
+
|
542 |
+
if encoder_hidden_states is not None:
|
543 |
+
encoder_hidden_states, hidden_states = (
|
544 |
+
hidden_states[:, : encoder_hidden_states.shape[1]],
|
545 |
+
hidden_states[:, encoder_hidden_states.shape[1] :],
|
546 |
+
)
|
547 |
+
|
548 |
+
# linear proj
|
549 |
+
hidden_states = attn.to_out[0](hidden_states)
|
550 |
+
# dropout
|
551 |
+
hidden_states = attn.to_out[1](hidden_states)
|
552 |
+
|
553 |
+
encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
|
554 |
+
|
555 |
+
return hidden_states, encoder_hidden_states
|
556 |
+
else:
|
557 |
+
return hidden_states
|
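The masked injection above works by broadcasting a (1, 1, num_image_tokens, 1) query mask against the (batch, heads, tokens, head_dim) projections. A minimal, self-contained sketch of the selection pattern used in the `inject_kv_foreground` branch (hypothetical sizes, independent of the pipeline):

import torch

# Hypothetical sizes: 2 composition images, 4 heads, 6 image tokens, 8 dims per head.
batch, heads, tokens, head_dim = 2, 4, 6, 8
background_k = torch.randn(1, heads, tokens, head_dim)    # batch element 0
foreground_k = torch.randn(1, heads, tokens, head_dim)    # batch element 1

# Spatial mask over image tokens, reshaped to broadcast over (batch, heads, tokens, head_dim).
q_mask = torch.tensor([1, 1, 0, 0, 1, 0], dtype=torch.bool)
mask = q_mask.view(1, 1, tokens, 1)

# Inside the mask take the foreground keys, outside take the background keys.
composition_k = torch.where(mask, foreground_k, background_k).expand(batch, -1, -1, -1)
print(composition_k.shape)  # torch.Size([2, 4, 6, 8])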
SDLens/cache_and_edit/scheduler_inversion.py
ADDED
@@ -0,0 +1,98 @@
1 |
+
from typing import List, Optional, Tuple, Union
|
2 |
+
import torch
|
3 |
+
from diffusers.configuration_utils import register_to_config
|
4 |
+
from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler, FlowMatchEulerDiscreteSchedulerOutput
|
5 |
+
|
6 |
+
|
7 |
+
class FlowMatchEulerDiscreteSchedulerForInversion(FlowMatchEulerDiscreteScheduler):
|
8 |
+
|
9 |
+
@register_to_config
|
10 |
+
def __init__(self, inverse: bool, **kwargs):
|
11 |
+
super().__init__(**kwargs)
|
12 |
+
self.inverse = inverse
|
13 |
+
|
14 |
+
|
15 |
+
def step(
|
16 |
+
self,
|
17 |
+
model_output: torch.FloatTensor,
|
18 |
+
timestep: Union[float, torch.FloatTensor],
|
19 |
+
sample: torch.FloatTensor,
|
20 |
+
s_churn: float = 0.0,
|
21 |
+
s_tmin: float = 0.0,
|
22 |
+
s_tmax: float = float("inf"),
|
23 |
+
s_noise: float = 1.0,
|
24 |
+
generator: Optional[torch.Generator] = None,
|
25 |
+
return_dict: bool = True,
|
26 |
+
) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
|
27 |
+
"""
|
28 |
+
Predict the sample for the next scheduler step: the regular flow-matching Euler update when `inverse` is False, or the reversed (noising) update when `inverse` is True. This function propagates the diffusion
|
29 |
+
process from the learned model outputs (most often the predicted noise).
|
30 |
+
|
31 |
+
Args:
|
32 |
+
model_output (`torch.FloatTensor`):
|
33 |
+
The direct output from learned diffusion model.
|
34 |
+
timestep (`float`):
|
35 |
+
The current discrete timestep in the diffusion chain.
|
36 |
+
sample (`torch.FloatTensor`):
|
37 |
+
A current instance of a sample created by the diffusion process.
|
38 |
+
s_churn (`float`):
|
39 |
+
s_tmin (`float`):
|
40 |
+
s_tmax (`float`):
|
41 |
+
s_noise (`float`, defaults to 1.0):
|
42 |
+
Scaling factor for noise added to the sample.
|
43 |
+
generator (`torch.Generator`, *optional*):
|
44 |
+
A random number generator.
|
45 |
+
return_dict (`bool`):
|
46 |
+
Whether or not to return a [`~schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteSchedulerOutput`] or
|
47 |
+
tuple.
|
48 |
+
|
49 |
+
Returns:
|
50 |
+
[`~schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteSchedulerOutput`] or `tuple`:
|
51 |
+
If return_dict is `True`, [`~schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteSchedulerOutput`] is
|
52 |
+
returned, otherwise a tuple is returned where the first element is the sample tensor.
|
53 |
+
"""
|
54 |
+
|
55 |
+
if (
|
56 |
+
isinstance(timestep, int)
|
57 |
+
or isinstance(timestep, torch.IntTensor)
|
58 |
+
or isinstance(timestep, torch.LongTensor)
|
59 |
+
):
|
60 |
+
raise ValueError(
|
61 |
+
(
|
62 |
+
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
63 |
+
" `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
|
64 |
+
" one of the `scheduler.timesteps` as a timestep."
|
65 |
+
),
|
66 |
+
)
|
67 |
+
|
68 |
+
if self.step_index is None:
|
69 |
+
self._init_step_index(timestep)
|
70 |
+
|
71 |
+
# Upcast to avoid precision issues when computing prev_sample
|
72 |
+
sample = sample.to(torch.float32)
|
73 |
+
|
74 |
+
sigma = self.sigmas[self.step_index]
|
75 |
+
sigma_next = self.sigmas[self.step_index + 1]
|
76 |
+
|
77 |
+
if self.inverse:
|
78 |
+
next_sample = sample + (sigma - sigma_next) * model_output
|
79 |
+
# Cast sample back to model compatible dtype
|
80 |
+
next_sample = next_sample.to(model_output.dtype)
|
81 |
+
# upon completion decrease the step index by one (the inversion walks the sigma schedule backwards)
|
82 |
+
self._step_index -= 1
|
83 |
+
|
84 |
+
if not return_dict:
|
85 |
+
return (next_sample,)
|
86 |
+
|
87 |
+
return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=next_sample)
|
88 |
+
else:
|
89 |
+
prev_sample = sample + (sigma_next - sigma) * model_output
|
90 |
+
# Cast sample back to model compatible dtype
|
91 |
+
prev_sample = prev_sample.to(model_output.dtype)
|
92 |
+
# upon completion increase step index by one
|
93 |
+
self._step_index += 1
|
94 |
+
|
95 |
+
if not return_dict:
|
96 |
+
return (prev_sample,)
|
97 |
+
|
98 |
+
return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
|
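The inversion branch above is the flow-matching Euler update with the sign of the sigma difference flipped and the step index walked backwards. A small numeric sketch (toy numbers, not the real sigma schedule) showing that the inverse step undoes a denoising step when the model output is the same:

import torch

sigma, sigma_next = 1.0, 0.75            # toy sigmas
x = torch.tensor([0.5, -1.0, 2.0])       # current latent
v = torch.tensor([0.2, 0.1, -0.3])       # model output (predicted velocity)

x_denoised = x + (sigma_next - sigma) * v             # regular step: prev_sample
x_recovered = x_denoised + (sigma - sigma_next) * v   # inverse step with the same model output
assert torch.allclose(x_recovered, x)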
SDLens/hooked_scheduler.py
ADDED
@@ -0,0 +1,40 @@
1 |
+
from diffusers import DDPMScheduler
|
2 |
+
import torch
|
3 |
+
|
4 |
+
class HookedNoiseScheduler:
|
5 |
+
scheduler: DDPMScheduler
|
6 |
+
pre_hooks: list
|
7 |
+
post_hooks: list
|
8 |
+
|
9 |
+
def __init__(self, scheduler):
|
10 |
+
object.__setattr__(self, 'scheduler', scheduler)
|
11 |
+
object.__setattr__(self, 'pre_hooks', [])
|
12 |
+
object.__setattr__(self, 'post_hooks', [])
|
13 |
+
|
14 |
+
def step(
|
15 |
+
self,
|
16 |
+
model_output, timestep, sample, generator, return_dict
|
17 |
+
):
|
18 |
+
assert not return_dict, "return_dict=True is not implemented"
|
19 |
+
for hook in self.pre_hooks:
|
20 |
+
hook_output = hook(model_output, timestep, sample, generator)
|
21 |
+
if hook_output is not None:
|
22 |
+
model_output, timestep, sample, generator = hook_output
|
23 |
+
|
24 |
+
(pred_prev_sample, ) = self.scheduler.step(model_output, timestep, sample, generator, return_dict)
|
25 |
+
|
26 |
+
for hook in self.post_hooks:
|
27 |
+
hook_output = hook(pred_prev_sample)
|
28 |
+
if hook_output is not None:
|
29 |
+
pred_prev_sample = hook_output
|
30 |
+
|
31 |
+
return (pred_prev_sample, )
|
32 |
+
|
33 |
+
def __getattr__(self, name):
|
34 |
+
return getattr(self.scheduler, name)
|
35 |
+
|
36 |
+
def __setattr__(self, name, value):
|
37 |
+
if name in {'scheduler', 'pre_hooks', 'post_hooks'}:
|
38 |
+
object.__setattr__(self, name, value)
|
39 |
+
else:
|
40 |
+
setattr(self.scheduler, name, value)
|
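A minimal usage sketch of the wrapper above (the import path is assumed from the repo layout; hook signatures follow `step`): pre-hooks receive (model_output, timestep, sample, generator) and may return modified values, post-hooks receive the predicted previous sample.

import torch
from diffusers import DDPMScheduler
from SDLens.hooked_scheduler import HookedNoiseScheduler  # assumed import path

scheduler = HookedNoiseScheduler(DDPMScheduler())

# Log the timestep before every scheduler step (returning None leaves the inputs unchanged).
scheduler.pre_hooks.append(lambda model_output, t, sample, generator: print("step at t =", int(t)))
# Optionally post-process the predicted previous sample.
scheduler.post_hooks.append(lambda prev_sample: prev_sample.clamp(-10, 10))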
SDLens/hooked_sd_pipeline.py
ADDED
@@ -0,0 +1,319 @@
1 |
+
import einops
|
2 |
+
from diffusers import StableDiffusionXLPipeline, IFPipeline
|
3 |
+
from typing import List, Dict, Callable, Union
|
4 |
+
import torch
|
5 |
+
from .hooked_scheduler import HookedNoiseScheduler
|
6 |
+
|
7 |
+
def retrieve(io):
|
8 |
+
if isinstance(io, tuple):
|
9 |
+
if len(io) == 1:
|
10 |
+
return io[0]
|
11 |
+
else:
|
12 |
+
raise ValueError("A tuple should have length of 1")
|
13 |
+
elif isinstance(io, torch.Tensor):
|
14 |
+
return io
|
15 |
+
else:
|
16 |
+
raise ValueError("Input/Output must be a tensor, or 1-element tuple")
|
17 |
+
|
18 |
+
|
19 |
+
class HookedDiffusionAbstractPipeline:
|
20 |
+
parent_cls = None
|
21 |
+
pipe = None
|
22 |
+
|
23 |
+
def __init__(self, pipe: parent_cls, use_hooked_scheduler: bool = False):
|
24 |
+
if use_hooked_scheduler:
|
25 |
+
pipe.scheduler = HookedNoiseScheduler(pipe.scheduler)
|
26 |
+
self.__dict__['pipe'] = pipe
|
27 |
+
self.use_hooked_scheduler = use_hooked_scheduler
|
28 |
+
|
29 |
+
@classmethod
|
30 |
+
def from_pretrained(cls, *args, **kwargs):
|
31 |
+
return cls(cls.parent_cls.from_pretrained(*args, **kwargs))
|
32 |
+
|
33 |
+
|
34 |
+
def run_with_hooks(self,
|
35 |
+
*args,
|
36 |
+
position_hook_dict: Dict[str, Union[Callable, List[Callable]]],
|
37 |
+
**kwargs
|
38 |
+
):
|
39 |
+
'''
|
40 |
+
Run the pipeline with hooks at specified positions.
|
41 |
+
Returns the final output.
|
42 |
+
|
43 |
+
Args:
|
44 |
+
*args: Arguments to pass to the pipeline.
|
45 |
+
position_hook_dict: A dictionary mapping positions to hooks.
|
46 |
+
The keys are positions in the pipeline where the hooks should be registered.
|
47 |
+
The values are either a single hook or a list of hooks to be registered at the specified position.
|
48 |
+
Each hook should be a callable that takes three arguments: (module, input, output).
|
49 |
+
**kwargs: Keyword arguments to pass to the pipeline.
|
50 |
+
'''
|
51 |
+
hooks = []
|
52 |
+
for position, hook in position_hook_dict.items():
|
53 |
+
if isinstance(hook, list):
|
54 |
+
for h in hook:
|
55 |
+
hooks.append(self._register_general_hook(position, h))
|
56 |
+
else:
|
57 |
+
hooks.append(self._register_general_hook(position, hook))
|
58 |
+
|
59 |
+
hooks = [hook for hook in hooks if hook is not None]
|
60 |
+
|
61 |
+
try:
|
62 |
+
output = self.pipe(*args, **kwargs)
|
63 |
+
finally:
|
64 |
+
for hook in hooks:
|
65 |
+
hook.remove()
|
66 |
+
if self.use_hooked_scheduler:
|
67 |
+
self.pipe.scheduler.pre_hooks = []
|
68 |
+
self.pipe.scheduler.post_hooks = []
|
69 |
+
|
70 |
+
return output
|
71 |
+
|
72 |
+
def run_with_cache(self,
|
73 |
+
*args,
|
74 |
+
positions_to_cache: List[str],
|
75 |
+
save_input: bool = False,
|
76 |
+
save_output: bool = True,
|
77 |
+
**kwargs
|
78 |
+
):
|
79 |
+
'''
|
80 |
+
Run the pipeline with caching at specified positions.
|
81 |
+
|
82 |
+
This method allows you to cache the intermediate inputs and/or outputs of the pipeline
|
83 |
+
at certain positions. The final output of the pipeline and a dictionary of cached values
|
84 |
+
are returned.
|
85 |
+
|
86 |
+
Args:
|
87 |
+
*args: Arguments to pass to the pipeline.
|
88 |
+
positions_to_cache (List[str]): A list of positions in the pipeline where intermediate
|
89 |
+
inputs/outputs should be cached.
|
90 |
+
save_input (bool, optional): If True, caches the input at each specified position.
|
91 |
+
Defaults to False.
|
92 |
+
save_output (bool, optional): If True, caches the output at each specified position.
|
93 |
+
Defaults to True.
|
94 |
+
**kwargs: Keyword arguments to pass to the pipeline.
|
95 |
+
|
96 |
+
Returns:
|
97 |
+
final_output: The final output of the pipeline after execution.
|
98 |
+
cache_dict (Dict[str, Dict[str, Any]]): A dictionary where keys are the specified positions
|
99 |
+
and values are dictionaries containing the cached 'input' and/or 'output' at each position,
|
100 |
+
depending on the flags `save_input` and `save_output`.
|
101 |
+
'''
|
102 |
+
cache_input, cache_output = dict() if save_input else None, dict() if save_output else None
|
103 |
+
hooks = [
|
104 |
+
self._register_cache_hook(position, cache_input, cache_output) for position in positions_to_cache
|
105 |
+
]
|
106 |
+
hooks = [hook for hook in hooks if hook is not None]
|
107 |
+
output = self.pipe(*args, **kwargs)
|
108 |
+
for hook in hooks:
|
109 |
+
hook.remove()
|
110 |
+
if self.use_hooked_scheduler:
|
111 |
+
self.pipe.scheduler.pre_hooks = []
|
112 |
+
self.pipe.scheduler.post_hooks = []
|
113 |
+
|
114 |
+
cache_dict = {}
|
115 |
+
if save_input:
|
116 |
+
for position, block in cache_input.items():
|
117 |
+
cache_input[position] = torch.stack(block, dim=1)
|
118 |
+
cache_dict['input'] = cache_input
|
119 |
+
|
120 |
+
if save_output:
|
121 |
+
for position, block in cache_output.items():
|
122 |
+
cache_output[position] = torch.stack(block, dim=1)
|
123 |
+
cache_dict['output'] = cache_output
|
124 |
+
return output, cache_dict
|
125 |
+
|
126 |
+
def run_with_hooks_and_cache(self,
|
127 |
+
*args,
|
128 |
+
position_hook_dict: Dict[str, Union[Callable, List[Callable]]],
|
129 |
+
positions_to_cache: List[str] = [],
|
130 |
+
save_input: bool = False,
|
131 |
+
save_output: bool = True,
|
132 |
+
**kwargs
|
133 |
+
):
|
134 |
+
'''
|
135 |
+
Run the pipeline with hooks and caching at specified positions.
|
136 |
+
|
137 |
+
This method allows you to register hooks at certain positions in the pipeline and
|
138 |
+
cache intermediate inputs and/or outputs at specified positions. Hooks can be used
|
139 |
+
for inspecting or modifying the pipeline's execution, and caching stores intermediate
|
140 |
+
values for later inspection or use.
|
141 |
+
|
142 |
+
Args:
|
143 |
+
*args: Arguments to pass to the pipeline.
|
144 |
+
position_hook_dict Dict[str, Union[Callable, List[Callable]]]:
|
145 |
+
A dictionary where the keys are the positions in the pipeline, and the values
|
146 |
+
are hooks (either a single hook or a list of hooks) to be registered at those positions.
|
147 |
+
Each hook should be a callable that accepts three arguments: (module, input, output).
|
148 |
+
positions_to_cache (List[str], optional): A list of positions in the pipeline where
|
149 |
+
intermediate inputs/outputs should be cached. Defaults to an empty list.
|
150 |
+
save_input (bool, optional): If True, caches the input at each specified position.
|
151 |
+
Defaults to False.
|
152 |
+
save_output (bool, optional): If True, caches the output at each specified position.
|
153 |
+
Defaults to True.
|
154 |
+
**kwargs: Additional keyword arguments to pass to the pipeline.
|
155 |
+
|
156 |
+
Returns:
|
157 |
+
final_output: The final output of the pipeline after execution.
|
158 |
+
cache_dict (Dict[str, Dict[str, Any]]): A dictionary where keys are the specified positions
|
159 |
+
and values are dictionaries containing the cached 'input' and/or 'output' at each position,
|
160 |
+
depending on the flags `save_input` and `save_output`.
|
161 |
+
'''
|
162 |
+
cache_input, cache_output = dict() if save_input else None, dict() if save_output else None
|
163 |
+
hooks = [
|
164 |
+
self._register_cache_hook(position, cache_input, cache_output) for position in positions_to_cache
|
165 |
+
]
|
166 |
+
|
167 |
+
for position, hook in position_hook_dict.items():
|
168 |
+
if isinstance(hook, list):
|
169 |
+
for h in hook:
|
170 |
+
hooks.append(self._register_general_hook(position, h))
|
171 |
+
else:
|
172 |
+
hooks.append(self._register_general_hook(position, hook))
|
173 |
+
|
174 |
+
hooks = [hook for hook in hooks if hook is not None]
|
175 |
+
output = self.pipe(*args, **kwargs)
|
176 |
+
for hook in hooks:
|
177 |
+
hook.remove()
|
178 |
+
if self.use_hooked_scheduler:
|
179 |
+
self.pipe.scheduler.pre_hooks = []
|
180 |
+
self.pipe.scheduler.post_hooks = []
|
181 |
+
|
182 |
+
cache_dict = {}
|
183 |
+
if save_input:
|
184 |
+
for position, block in cache_input.items():
|
185 |
+
cache_input[position] = torch.stack(block, dim=1)
|
186 |
+
cache_dict['input'] = cache_input
|
187 |
+
|
188 |
+
if save_output:
|
189 |
+
for position, block in cache_output.items():
|
190 |
+
cache_output[position] = torch.stack(block, dim=1)
|
191 |
+
cache_dict['output'] = cache_output
|
192 |
+
|
193 |
+
return output, cache_dict
|
194 |
+
|
195 |
+
|
196 |
+
def _locate_block(self, position: str):
|
197 |
+
'''
|
198 |
+
Locate the block at the specified position in the pipeline.
|
199 |
+
'''
|
200 |
+
block = self.pipe
|
201 |
+
for step in position.split('.'):
|
202 |
+
if step.isdigit():
|
203 |
+
step = int(step)
|
204 |
+
block = block[step]
|
205 |
+
else:
|
206 |
+
block = getattr(block, step)
|
207 |
+
return block
|
208 |
+
|
209 |
+
|
210 |
+
def _register_cache_hook(self, position: str, cache_input: Dict, cache_output: Dict):
|
211 |
+
|
212 |
+
if position.endswith('$self_attention') or position.endswith('$cross_attention'):
|
213 |
+
return self._register_cache_attention_hook(position, cache_output)
|
214 |
+
|
215 |
+
if position == 'noise':
|
216 |
+
def hook(model_output, timestep, sample, generator):
|
217 |
+
if position not in cache_output:
|
218 |
+
cache_output[position] = []
|
219 |
+
cache_output[position].append(sample)
|
220 |
+
|
221 |
+
if self.use_hooked_scheduler:
|
222 |
+
self.pipe.scheduler.pre_hooks.append(hook)  # the hook signature matches the scheduler's pre-step hooks
|
223 |
+
else:
|
224 |
+
raise ValueError('Cannot cache noise without using hooked scheduler')
|
225 |
+
return
|
226 |
+
|
227 |
+
block = self._locate_block(position)
|
228 |
+
|
229 |
+
def hook(module, input, kwargs, output):
|
230 |
+
if cache_input is not None:
|
231 |
+
if position not in cache_input:
|
232 |
+
cache_input[position] = []
|
233 |
+
cache_input[position].append(retrieve(input))
|
234 |
+
|
235 |
+
if cache_output is not None:
|
236 |
+
if position not in cache_output:
|
237 |
+
cache_output[position] = []
|
238 |
+
cache_output[position].append(retrieve(output))
|
239 |
+
|
240 |
+
return block.register_forward_hook(hook, with_kwargs=True)
|
241 |
+
|
242 |
+
def _register_cache_attention_hook(self, position, cache):
|
243 |
+
attn_block = self._locate_block(position.split('$')[0])
|
244 |
+
if position.endswith('$self_attention'):
|
245 |
+
attn_block = attn_block.attn1
|
246 |
+
elif position.endswith('$cross_attention'):
|
247 |
+
attn_block = attn_block.attn2
|
248 |
+
else:
|
249 |
+
raise ValueError('Wrong attention type')
|
250 |
+
|
251 |
+
def hook(module, args, kwargs, output):
|
252 |
+
hidden_states = args[0]
|
253 |
+
encoder_hidden_states = kwargs['encoder_hidden_states']
|
254 |
+
attention_mask = kwargs['attention_mask']
|
255 |
+
batch_size, sequence_length, _ = hidden_states.shape
|
256 |
+
attention_mask = attn_block.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
257 |
+
query = attn_block.to_q(hidden_states)
|
258 |
+
|
259 |
+
|
260 |
+
if encoder_hidden_states is None:
|
261 |
+
encoder_hidden_states = hidden_states
|
262 |
+
elif attn_block.norm_cross is not None:
|
263 |
+
encoder_hidden_states = attn_block.norm_cross(encoder_hidden_states)
|
264 |
+
|
265 |
+
key = attn_block.to_k(encoder_hidden_states)
|
266 |
+
value = attn_block.to_v(encoder_hidden_states)
|
267 |
+
|
268 |
+
query = attn_block.head_to_batch_dim(query)
|
269 |
+
key = attn_block.head_to_batch_dim(key)
|
270 |
+
value = attn_block.head_to_batch_dim(value)
|
271 |
+
|
272 |
+
attention_probs = attn_block.get_attention_scores(query, key, attention_mask)
|
273 |
+
attention_probs = attention_probs.view(
|
274 |
+
batch_size,
|
275 |
+
attention_probs.shape[0] // batch_size,
|
276 |
+
attention_probs.shape[1],
|
277 |
+
attention_probs.shape[2]
|
278 |
+
)
|
279 |
+
if position not in cache:
|
280 |
+
cache[position] = []
|
281 |
+
cache[position].append(attention_probs)
|
282 |
+
|
283 |
+
return attn_block.register_forward_hook(hook, with_kwargs=True)
|
284 |
+
|
285 |
+
def _register_general_hook(self, position, hook):
|
286 |
+
if position == 'scheduler_pre':
|
287 |
+
if not self.use_hooked_scheduler:
|
288 |
+
raise ValueError('Cannot register hooks on scheduler without using hooked scheduler')
|
289 |
+
self.pipe.scheduler.pre_hooks.append(hook)
|
290 |
+
return
|
291 |
+
elif position == 'scheduler_post':
|
292 |
+
if not self.use_hooked_scheduler:
|
293 |
+
raise ValueError('Cannot register hooks on scheduler without using hooked scheduler')
|
294 |
+
self.pipe.scheduler.post_hooks.append(hook)
|
295 |
+
return
|
296 |
+
|
297 |
+
block = self._locate_block(position)
|
298 |
+
return block.register_forward_hook(hook)
|
299 |
+
|
300 |
+
def to(self, *args, **kwargs):
|
301 |
+
self.pipe = self.pipe.to(*args, **kwargs)
|
302 |
+
return self
|
303 |
+
|
304 |
+
def __getattr__(self, name):
|
305 |
+
return getattr(self.pipe, name)
|
306 |
+
|
307 |
+
def __setattr__(self, name, value):
|
308 |
+
return setattr(self.pipe, name, value)
|
309 |
+
|
310 |
+
def __call__(self, *args, **kwargs):
|
311 |
+
return self.pipe(*args, **kwargs)
|
312 |
+
|
313 |
+
|
314 |
+
class HookedStableDiffusionXLPipeline(HookedDiffusionAbstractPipeline):
|
315 |
+
parent_cls = StableDiffusionXLPipeline
|
316 |
+
|
317 |
+
|
318 |
+
class HookedIFPipeline(HookedDiffusionAbstractPipeline):
|
319 |
+
parent_cls = IFPipeline
|
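A short usage sketch of the hooked SDXL pipeline defined above (model id, prompt and shapes are illustrative; the block path is one of those used by the demo):

import torch
from SDLens import HookedStableDiffusionXLPipeline

pipe = HookedStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16
).to("cuda")

block = "unet.down_blocks.2.attentions.1"
out, cache = pipe.run_with_cache(
    "a photo of a corgi",
    positions_to_cache=[block],
    num_inference_steps=1,
    guidance_scale=0.0,
    save_input=True,
    save_output=True,
)
# Cached tensors are stacked over denoising steps along dim=1.
print(cache["output"][block].shape)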
app.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,768 @@
1 |
+
from functools import partial
|
2 |
+
import json
|
3 |
+
import gradio as gr
|
4 |
+
import os
|
5 |
+
|
6 |
+
# environment
|
7 |
+
os.environ['HF_HOME'] = '/dlabscratch1/anmari'
|
8 |
+
os.environ['TRANSFORMERS_CACHE'] = '/dlabscratch1/anmari'
|
9 |
+
os.environ['HF_DATASETS_CACHE'] = '/dlabscratch1/anmari'
|
10 |
+
# os.environ["HF_TOKEN"] = ""
|
11 |
+
import torch
|
12 |
+
from PIL import Image
|
13 |
+
from SDLens import HookedStableDiffusionXLPipeline, CachedPipeline as CachedFLuxPipeline
|
14 |
+
from SDLens.cache_and_edit.flux_pipeline import EditedFluxPipeline
|
15 |
+
from SAE import SparseAutoencoder
|
16 |
+
from utils import TimedHook, add_feature_on_area_base, replace_with_feature_base, add_feature_on_area_turbo, replace_with_feature_turbo, add_feature_on_area_flux
|
17 |
+
import numpy as np
|
18 |
+
import matplotlib.pyplot as plt
|
19 |
+
from matplotlib.colors import ListedColormap
|
20 |
+
import threading
|
21 |
+
from einops import rearrange
|
22 |
+
import spaces
|
23 |
+
# from retrieval import FeatureRetriever
|
24 |
+
|
25 |
+
|
26 |
+
code_to_block_sd = {
|
27 |
+
"down.2.1": "unet.down_blocks.2.attentions.1",
|
28 |
+
"mid.0": "unet.mid_block.attentions.0",
|
29 |
+
"up.0.1": "unet.up_blocks.0.attentions.1",
|
30 |
+
"up.0.0": "unet.up_blocks.0.attentions.0"
|
31 |
+
}
|
32 |
+
code_to_block_flux = {"18": "transformer.transformer_blocks.18"}
|
33 |
+
|
34 |
+
FLUX_NAMES = ["black-forest-labs/FLUX.1-schnell", "black-forest-labs/FLUX.1-dev"]
|
35 |
+
MODELS_CONFIG = {
|
36 |
+
"stabilityai/stable-diffusion-xl-base-1.0": {
|
37 |
+
"steps": 25,
|
38 |
+
"guidance_scale": 8.0,
|
39 |
+
"choices": ["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
|
40 |
+
"value": "down.2.1 (composition)",
|
41 |
+
"code_to_block": code_to_block_sd,
|
42 |
+
"max_steps": 50,
|
43 |
+
"is_flux": False,
|
44 |
+
"downsample_factor": 16,
|
45 |
+
"add_feature_on_area": add_feature_on_area_base,
|
46 |
+
"num_features": 5120,
|
47 |
+
|
48 |
+
},
|
49 |
+
"stabilityai/sdxl-turbo": {
|
50 |
+
"steps": 1,
|
51 |
+
"guidance_scale": 0.0,
|
52 |
+
"choices": ["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
|
53 |
+
"value": "down.2.1 (composition)",
|
54 |
+
"code_to_block": code_to_block_sd,
|
55 |
+
"max_steps": 4,
|
56 |
+
"is_flux": False,
|
57 |
+
"downsample_factor": 32,
|
58 |
+
"add_feature_on_area": add_feature_on_area_turbo,
|
59 |
+
"num_features": 5120,
|
60 |
+
},
|
61 |
+
"black-forest-labs/FLUX.1-schnell": {
|
62 |
+
"steps": 1,
|
63 |
+
"guidance_scale": 0.0,
|
64 |
+
"choices": ["18"],
|
65 |
+
"value": "18",
|
66 |
+
"code_to_block": code_to_block_flux,
|
67 |
+
"max_steps": 4,
|
68 |
+
"is_flux": True,
|
69 |
+
"exclude_list": [2462, 2974, 1577, 786, 3188, 9986, 4693, 8472, 8248, 325, 9596, 2813, 10803, 11773, 11410, 1067, 2965, 10488, 4537, 2102],
|
70 |
+
"downsample_factor": 8,
|
71 |
+
"add_feature_on_area": add_feature_on_area_flux,
|
72 |
+
"num_features": 12288
|
73 |
+
|
74 |
+
},
|
75 |
+
|
76 |
+
"black-forest-labs/FLUX.1-dev": {
|
77 |
+
"steps": 25,
|
78 |
+
"guidance_scale": 0.0,
|
79 |
+
"choices": ["18"],
|
80 |
+
"value": "18",
|
81 |
+
"code_to_block": code_to_block_flux,
|
82 |
+
"max_steps": 50,
|
83 |
+
"is_flux": True,
|
84 |
+
"exclude_list": [2462, 2974, 1577, 786, 3188, 9986, 4693, 8472, 8248, 325, 9596, 2813, 10803, 11773, 11410, 1067, 2965, 10488, 4537, 2102],
|
85 |
+
"downsample_factor": 8,
|
86 |
+
"add_feature_on_area": add_feature_on_area_flux,
|
87 |
+
"num_features": 12288
|
88 |
+
|
89 |
+
}
|
90 |
+
}
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
lock = threading.Lock()
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
def process_cache(cache, saes_dict, model_config, timestep=None):
|
102 |
+
|
103 |
+
top_features_dict = {}
|
104 |
+
sparse_maps_dict = {}
|
105 |
+
|
106 |
+
for code in model_config['code_to_block'].keys():
|
107 |
+
block = model_config["code_to_block"][code]
|
108 |
+
sae = saes_dict[code]
|
109 |
+
|
110 |
+
|
111 |
+
if model_config["is_flux"]:
|
112 |
+
|
113 |
+
with torch.no_grad():
|
114 |
+
features = sae.encode(torch.stack(cache.image_activation)) # shape: [timestep, batch, seq_len, num_features]
|
115 |
+
features[..., model_config["exclude_list"]] = 0
|
116 |
+
|
117 |
+
if timestep is not None and timestep < features.shape[0]:
|
118 |
+
features = features[timestep:timestep+1]
|
119 |
+
|
120 |
+
# reshape to [batch, timestep, 64, 64, num_features], then drop the singleton batch and timestep dims
|
121 |
+
sparse_maps = rearrange(features, "t b (w h) n -> b t w h n", w=64, h=64).squeeze(0).squeeze(0)
|
122 |
+
|
123 |
+
else:
|
124 |
+
|
125 |
+
diff = cache["output"][block] - cache["input"][block]
|
126 |
+
if diff.shape[0] == 2: # guidance is on and we need to select the second output
|
127 |
+
diff = diff[1].unsqueeze(0)
|
128 |
+
|
129 |
+
# If a specific timestep is provided, select that timestep from the cached activations
|
130 |
+
if timestep is not None and timestep < diff.shape[1]:
|
131 |
+
diff = diff[:, timestep:timestep+1]
|
132 |
+
|
133 |
+
diff = diff.permute(0, 1, 3, 4, 2).squeeze(0).squeeze(0)
|
134 |
+
with torch.no_grad():
|
135 |
+
sparse_maps = sae.encode(diff)
|
136 |
+
|
137 |
+
averages = torch.mean(sparse_maps, dim=(0, 1))
|
138 |
+
|
139 |
+
top_features = torch.topk(averages, 10).indices
|
140 |
+
|
141 |
+
top_features_dict[code] = top_features.cpu().tolist()
|
142 |
+
sparse_maps_dict[code] = sparse_maps.cpu().numpy()
|
143 |
+
|
144 |
+
return top_features_dict, sparse_maps_dict
|
145 |
+
|
146 |
+
|
147 |
+
def plot_image_heatmap(cache, block_select, radio, model_config):
|
148 |
+
code = block_select.split()[0]
|
149 |
+
feature = int(radio)
|
150 |
+
|
151 |
+
heatmap = cache["heatmaps"][code][:, :, feature]
|
152 |
+
scaling_factor = 16 if model_config["is_flux"] else 32
|
153 |
+
heatmap = np.kron(heatmap, np.ones((scaling_factor, scaling_factor)))
|
154 |
+
image = cache["image"].convert("RGBA")
|
155 |
+
|
156 |
+
jet = plt.cm.jet
|
157 |
+
cmap = jet(np.arange(jet.N))
|
158 |
+
cmap[:1, -1] = 0
|
159 |
+
cmap[1:, -1] = 0.6
|
160 |
+
cmap = ListedColormap(cmap)
|
161 |
+
heatmap = (heatmap - np.min(heatmap)) / (np.max(heatmap) - np.min(heatmap))
|
162 |
+
heatmap_rgba = cmap(heatmap)
|
163 |
+
heatmap_image = Image.fromarray((heatmap_rgba * 255).astype(np.uint8))
|
164 |
+
heatmap_with_transparency = Image.alpha_composite(image, heatmap_image)
|
165 |
+
|
166 |
+
return heatmap_with_transparency
|
167 |
+
|
168 |
+
|
169 |
+
def create_prompt_part(pipe, saes_dict, demo):
|
170 |
+
|
171 |
+
model_config = MODELS_CONFIG[pipe.pipe.name_or_path]
|
172 |
+
@spaces.GPU
|
173 |
+
def image_gen(prompt, timestep=None, num_steps=None, guidance_scale=None):
|
174 |
+
lock.acquire()
|
175 |
+
try:
|
176 |
+
# Default values
|
177 |
+
default_n_steps = model_config["steps"]
|
178 |
+
default_guidance = model_config["guidance_scale"]
|
179 |
+
|
180 |
+
# Use provided values if available, otherwise use defaults
|
181 |
+
n_steps = default_n_steps if num_steps is None else int(num_steps)
|
182 |
+
guidance = default_guidance if guidance_scale is None else float(guidance_scale)
|
183 |
+
|
184 |
+
# Convert timestep to integer if it's not None
|
185 |
+
timestep_int = None if timestep is None else int(timestep)
|
186 |
+
|
187 |
+
if "FLUX" in pipe.pipe.name_or_path:
|
188 |
+
images = pipe.run(
|
189 |
+
prompt,
|
190 |
+
num_inference_steps=n_steps,
|
191 |
+
width=1024,
|
192 |
+
height=1024,
|
193 |
+
cache_activations=True,
|
194 |
+
guidance_scale=guidance,
|
195 |
+
positions_to_cache = list(model_config["code_to_block"].values()),
|
196 |
+
inverse=False,
|
197 |
+
)
|
198 |
+
cache = pipe.activation_cache
|
199 |
+
|
200 |
+
else:
|
201 |
+
images, cache = pipe.run_with_cache(
|
202 |
+
prompt,
|
203 |
+
positions_to_cache=list(model_config["code_to_block"].values()),
|
204 |
+
num_inference_steps=n_steps,
|
205 |
+
generator=torch.Generator(device="cpu").manual_seed(42),
|
206 |
+
guidance_scale=guidance,
|
207 |
+
save_input=True,
|
208 |
+
save_output=True
|
209 |
+
)
|
210 |
+
finally:
|
211 |
+
lock.release()
|
212 |
+
|
213 |
+
top_features_dict, top_sparse_maps_dict = process_cache(cache, saes_dict, model_config, timestep_int)
|
214 |
+
return images.images[0], {
|
215 |
+
"image": images.images[0],
|
216 |
+
"heatmaps": top_sparse_maps_dict,
|
217 |
+
"features": top_features_dict
|
218 |
+
}
|
219 |
+
|
220 |
+
def update_radio(cache, block_select):
|
221 |
+
code = block_select.split()[0]
|
222 |
+
return gr.update(choices=cache["features"][code])
|
223 |
+
|
224 |
+
def update_img(cache, block_select, radio):
|
225 |
+
new_img = plot_image_heatmap(cache, block_select, radio, model_config)
|
226 |
+
return new_img
|
227 |
+
|
228 |
+
with gr.Tab("Explore", elem_classes="tabs") as explore_tab:
|
229 |
+
cache = gr.State(value={
|
230 |
+
"image": None,
|
231 |
+
"heatmaps": None,
|
232 |
+
"features": []
|
233 |
+
})
|
234 |
+
with gr.Row():
|
235 |
+
with gr.Column(scale=7):
|
236 |
+
with gr.Row(equal_height=True):
|
237 |
+
prompt_field = gr.Textbox(lines=1, label="Enter prompt here", value="A cinematic shot of a professor sloth wearing a tuxedo at a BBQ party and eathing a dish with peas.")
|
238 |
+
button = gr.Button("Generate", elem_classes="generate_button1")
|
239 |
+
|
240 |
+
with gr.Row():
|
241 |
+
image = gr.Image(width=512, height=512, image_mode="RGB", label="Generated image")
|
242 |
+
|
243 |
+
with gr.Column(scale=4):
|
244 |
+
block_select = gr.Dropdown(
|
245 |
+
choices=model_config["choices"], # replace this for flux
|
246 |
+
value=model_config["value"],
|
247 |
+
label="Select block",
|
248 |
+
elem_id="block_select",
|
249 |
+
interactive=True
|
250 |
+
)
|
251 |
+
|
252 |
+
with gr.Group() as sdxl_base_controls:
|
253 |
+
steps_slider = gr.Slider(
|
254 |
+
minimum=1,
|
255 |
+
maximum=model_config["max_steps"],
|
256 |
+
value= model_config["steps"],
|
257 |
+
step=1,
|
258 |
+
label="Number of steps",
|
259 |
+
elem_id="steps_slider",
|
260 |
+
interactive=True,
|
261 |
+
visible=True
|
262 |
+
)
|
263 |
+
|
264 |
+
# Add timestep selector
|
265 |
+
# TODO: check this
|
266 |
+
timestep_selector = gr.Slider(
|
267 |
+
minimum=0,
|
268 |
+
maximum=model_config["max_steps"]-1,
|
269 |
+
value=None,
|
270 |
+
step=1,
|
271 |
+
label="Timestep (leave empty for average across all steps)",
|
272 |
+
elem_id="timestep_selector",
|
273 |
+
interactive=True,
|
274 |
+
visible=True,
|
275 |
+
)
|
276 |
+
recompute_button = gr.Button("Recompute", elem_id="recompute_button")
|
277 |
+
# Update max timestep when steps change
|
278 |
+
steps_slider.change(lambda s: gr.update(maximum=s-1), [steps_slider], [timestep_selector])
|
279 |
+
|
280 |
+
radio = gr.Radio(choices=[], label="Select a feature", interactive=True)
|
281 |
+
|
282 |
+
button.click(image_gen, [prompt_field, timestep_selector, steps_slider], outputs=[image, cache])
|
283 |
+
cache.change(update_radio, [cache, block_select], outputs=[radio])
|
284 |
+
block_select.select(update_radio, [cache, block_select], outputs=[radio])
|
285 |
+
radio.select(update_img, [cache, block_select, radio], outputs=[image])
|
286 |
+
recompute_button.click(image_gen, [prompt_field, timestep_selector, steps_slider], outputs=[image, cache])
|
287 |
+
demo.load(image_gen, [prompt_field, timestep_selector, steps_slider], outputs=[image, cache])
|
288 |
+
|
289 |
+
return explore_tab
|
290 |
+
|
291 |
+
def downsample_mask(image, factor):
|
292 |
+
downsampled = image.reshape(
|
293 |
+
(image.shape[0] // factor, factor,
|
294 |
+
image.shape[1] // factor, factor)
|
295 |
+
)
|
296 |
+
downsampled = downsampled.mean(axis=(1, 3))
|
297 |
+
return downsampled
|
298 |
+
|
299 |
+
def create_intervene_part(pipe: HookedStableDiffusionXLPipeline, saes_dict, means_dict, demo):
|
300 |
+
model_config = MODELS_CONFIG[pipe.pipe.name_or_path]
|
301 |
+
|
302 |
+
@spaces.GPU
|
303 |
+
def image_gen(prompt, num_steps, guidance_scale=None):
|
304 |
+
lock.acquire()
|
305 |
+
guidance = model_config["guidance_scale"] if guidance_scale is None else float(guidance_scale)
|
306 |
+
try:
|
307 |
+
|
308 |
+
if "FLUX" in pipe.pipe.name_or_path:
|
309 |
+
images = pipe.run(
|
310 |
+
prompt,
|
311 |
+
num_inference_steps=int(num_steps),
|
312 |
+
width=1024,
|
313 |
+
height=1024,
|
314 |
+
cache_activations=False,
|
315 |
+
guidance_scale=guidance,
|
316 |
+
inverse=False,
|
317 |
+
)
|
318 |
+
else:
|
319 |
+
images = pipe.run_with_hooks(
|
320 |
+
prompt,
|
321 |
+
position_hook_dict={},
|
322 |
+
num_inference_steps=int(num_steps),
|
323 |
+
generator=torch.Generator(device="cpu").manual_seed(42),
|
324 |
+
guidance_scale=guidance,
|
325 |
+
)
|
326 |
+
finally:
|
327 |
+
lock.release()
|
328 |
+
if images.images[0].size == (1024, 1024):
|
329 |
+
return images.images[0].resize((512, 512))
|
330 |
+
else:
|
331 |
+
return images.images[0]
|
332 |
+
|
333 |
+
@spaces.GPU
|
334 |
+
# NOTE: start_index/end_index precede guidance_scale so the positional Gradio click inputs map correctly
def image_mod(prompt, block_str, brush_index, strength, num_steps, input_image, start_index=None, end_index=None, guidance_scale=None):
|
335 |
+
block = block_str.split(" ")[0]
|
336 |
+
|
337 |
+
mask = (input_image["layers"][0] > 0)[:, :, -1].astype(float)
|
338 |
+
mask = downsample_mask(mask, model_config["downsample_factor"])
|
339 |
+
mask = torch.tensor(mask, dtype=torch.float32, device="cuda")
|
340 |
+
|
341 |
+
if mask.sum() == 0:
|
342 |
+
gr.Info("No mask selected, please draw on the input image")
|
343 |
+
|
344 |
+
|
345 |
+
# Set default values for start_index and end_index if not provided
|
346 |
+
if start_index is None:
|
347 |
+
start_index = 0
|
348 |
+
if end_index is None:
|
349 |
+
end_index = int(num_steps)
|
350 |
+
|
351 |
+
# Ensure start_index and end_index are within valid ranges
|
352 |
+
start_index = max(0, min(int(start_index), int(num_steps)))
|
353 |
+
end_index = max(0, min(int(end_index), int(num_steps)))
|
354 |
+
|
355 |
+
# Ensure start_index is less than end_index
|
356 |
+
if start_index >= end_index:
|
357 |
+
start_index = max(0, end_index - 1)
|
358 |
+
|
359 |
+
|
360 |
+
def myhook(module, input, output):
|
361 |
+
return model_config["add_feature_on_area"](
|
362 |
+
saes_dict[block],
|
363 |
+
brush_index,
|
364 |
+
mask * means_dict[block][brush_index] * strength,
|
365 |
+
module,
|
366 |
+
input,
|
367 |
+
output)
|
368 |
+
hook = TimedHook(myhook, int(num_steps), np.arange(start_index, end_index))
|
369 |
+
|
370 |
+
lock.acquire()
|
371 |
+
guidance = model_config["guidance_scale"] if guidance_scale is None else float(guidance_scale)
|
372 |
+
|
373 |
+
try:
|
374 |
+
|
375 |
+
if model_config["is_flux"]:
|
376 |
+
image = pipe.run_with_edit(
|
377 |
+
prompt,
|
378 |
+
seed=42,
|
379 |
+
num_inference_steps=int(num_steps),
|
380 |
+
edit_fn= lambda input, output: hook(None, input, output),
|
381 |
+
layers_for_edit_fn=[i for i in range(18, 57)],
|
382 |
+
stream="image").images[0]
|
383 |
+
else:
|
384 |
+
|
385 |
+
image = pipe.run_with_hooks(
|
386 |
+
prompt,
|
387 |
+
position_hook_dict={model_config["code_to_block"][block]: hook},
|
388 |
+
num_inference_steps=int(num_steps),
|
389 |
+
generator=torch.Generator(device="cpu").manual_seed(42),
|
390 |
+
guidance_scale=guidance
|
391 |
+
).images[0]
|
392 |
+
finally:
|
393 |
+
lock.release()
|
394 |
+
return image
|
395 |
+
|
396 |
+
def feature_icon(block_str, brush_index, guidance_scale=None):
|
397 |
+
block = block_str.split(" ")[0]
|
398 |
+
if block in ["mid.0", "up.0.0"]:
|
399 |
+
gr.Info("Note that Feature Icon works best with down.2.1 and up.0.1 blocks but feel free to explore", duration=3)
|
400 |
+
|
401 |
+
def hook(module, input, output):
|
402 |
+
if "stable-diffusion-xl-base" in pipe.pipe.name_or_path:  # SDXL base (non-turbo)
|
403 |
+
return replace_with_feature_base(
|
404 |
+
saes_dict[block],
|
405 |
+
brush_index,
|
406 |
+
means_dict[block][brush_index] * saes_dict[block].k,
|
407 |
+
module,
|
408 |
+
input,
|
409 |
+
output
|
410 |
+
)
|
411 |
+
else:
|
412 |
+
return replace_with_feature_turbo(
|
413 |
+
saes_dict[block],
|
414 |
+
brush_index,
|
415 |
+
means_dict[block][brush_index] * saes_dict[block].k,
|
416 |
+
module,
|
417 |
+
input,
|
418 |
+
output)
|
419 |
+
lock.acquire()
|
420 |
+
guidance = model_config["guidance_scale"] if guidance_scale is None else float(guidance_scale)
|
421 |
+
|
422 |
+
try:
|
423 |
+
image = pipe.run_with_hooks(
|
424 |
+
"",
|
425 |
+
position_hook_dict={model_config["code_to_block"][block]: hook},
|
426 |
+
num_inference_steps=model_config["steps"],
|
427 |
+
generator=torch.Generator(device="cpu").manual_seed(42),
|
428 |
+
guidance_scale=guidance,
|
429 |
+
).images[0]
|
430 |
+
finally:
|
431 |
+
lock.release()
|
432 |
+
return image
|
433 |
+
|
434 |
+
with gr.Tab("Paint!", elem_classes="tabs") as intervene_tab:
|
435 |
+
image_state = gr.State(value=None)
|
436 |
+
with gr.Row():
|
437 |
+
with gr.Column(scale=3):
|
438 |
+
# Generation column
|
439 |
+
with gr.Row():
|
440 |
+
# prompt and num_steps
|
441 |
+
prompt_field = gr.Textbox(lines=1, label="Enter prompt here", value="A dog plays with a ball, cartoon", elem_id="prompt_input")
|
442 |
+
|
443 |
+
with gr.Row():
|
444 |
+
num_steps = gr.Number(value=model_config["steps"], label="Number of steps", minimum=1, maximum=model_config["max_steps"], elem_id="num_steps", precision=0)
|
445 |
+
|
446 |
+
with gr.Row():
|
447 |
+
# Generate button
|
448 |
+
button_generate = gr.Button("Generate", elem_id="generate_button")
|
449 |
+
with gr.Column(scale=3):
|
450 |
+
# Intervention column
|
451 |
+
with gr.Row():
|
452 |
+
# dropdowns and number inputs
|
453 |
+
with gr.Column(scale=7):
|
454 |
+
with gr.Row():
|
455 |
+
block_select = gr.Dropdown(
|
456 |
+
choices=model_config["choices"],
|
457 |
+
value=model_config["value"],
|
458 |
+
label="Select block",
|
459 |
+
elem_id="block_select"
|
460 |
+
)
|
461 |
+
brush_index = gr.Number(value=0, label="Brush index", minimum=0, maximum=model_config["num_features"]-1, elem_id="brush_index", precision=0)
|
462 |
+
# with gr.Row():
|
463 |
+
# button_icon = gr.Button('Feature Icon', elem_id="feature_icon_button")
|
464 |
+
with gr.Row():
|
465 |
+
gr.Markdown("**TimedHook Range** (which steps to apply the feature)", visible=True)
|
466 |
+
with gr.Row():
|
467 |
+
start_index = gr.Number(value=0, label="Start index", minimum=0, maximum=model_config["max_steps"], elem_id="start_index", precision=0, visible=True)
|
468 |
+
end_index = gr.Number(value=model_config["steps"], label="End index", minimum=0, maximum=model_config["max_steps"], elem_id="end_index", precision=0, visible=True)
|
469 |
+
with gr.Column(scale=3):
|
470 |
+
with gr.Row():
|
471 |
+
strength = gr.Number(value=10, label="Strength", minimum=-40, maximum=40, elem_id="strength", precision=2)
|
472 |
+
with gr.Row():
|
473 |
+
button = gr.Button('Apply', elem_id="apply_button")
|
474 |
+
|
475 |
+
with gr.Row():
|
476 |
+
with gr.Column():
|
477 |
+
# Input image
|
478 |
+
i_image = gr.Sketchpad(
|
479 |
+
height=610,
|
480 |
+
layers=False, transforms=[], placeholder="Generate and paint!",
|
481 |
+
brush=gr.Brush(default_size=64, color_mode="fixed", colors=['black']),
|
482 |
+
container=False,
|
483 |
+
canvas_size=(512, 512),
|
484 |
+
label="Input Image")
|
485 |
+
clear_button = gr.Button("Clear")
|
486 |
+
clear_button.click(lambda x: x, [image_state], [i_image])
|
487 |
+
# Output image
|
488 |
+
o_image = gr.Image(width=512, height=512, label="Output Image")
|
489 |
+
|
490 |
+
# Set up the click events
|
491 |
+
button_generate.click(image_gen, inputs=[prompt_field, num_steps], outputs=[image_state])
|
492 |
+
image_state.change(lambda x: x, [image_state], [i_image])
|
493 |
+
|
494 |
+
# Update max values for start_index and end_index when num_steps changes
|
495 |
+
def update_index_maxes(steps):
|
496 |
+
return gr.update(maximum=steps), gr.update(maximum=steps)
|
497 |
+
|
498 |
+
num_steps.change(update_index_maxes, [num_steps], [start_index, end_index])
|
499 |
+
|
500 |
+
button.click(image_mod,
|
501 |
+
inputs=[prompt_field, block_select, brush_index, strength, num_steps, i_image, start_index, end_index],
|
502 |
+
outputs=o_image)
|
503 |
+
# button_icon.click(feature_icon, inputs=[block_select, brush_index], outputs=o_image)
|
504 |
+
demo.load(image_gen, [prompt_field, num_steps], outputs=[image_state])
|
505 |
+
|
506 |
+
|
507 |
+
return intervene_tab
|
508 |
+
|
509 |
+
|
510 |
+
|
511 |
+
def create_top_images_part(demo, pipe):
|
512 |
+
|
513 |
+
model_config = MODELS_CONFIG[pipe.pipe.name_or_path]
|
514 |
+
|
515 |
+
if isinstance(pipe, HookedStableDiffusionXLPipeline):
|
516 |
+
is_flux = False
|
517 |
+
elif isinstance(pipe, CachedFLuxPipeline):
|
518 |
+
is_flux = True
|
519 |
+
else:
|
520 |
+
raise AssertionError(f"Unknown pipe class: {type(pipe)}")
|
521 |
+
|
522 |
+
def update_top_images(block_select, brush_index):
|
523 |
+
block = block_select.split(" ")[0]
|
524 |
+
# Define path for fetching image
|
525 |
+
if is_flux:
|
526 |
+
part = 1 if brush_index <= 7000 else 2
|
527 |
+
url = f"https://huggingface.co/datasets/antoniomari/flux_sae_images/resolve/main/{block}/part{part}/{brush_index}.jpg"
|
528 |
+
else:
|
529 |
+
url = f"https://huggingface.co/surokpro2/sdxl_sae_images/resolve/main/{block}/{brush_index}.jpg"
|
530 |
+
return url
|
531 |
+
|
532 |
+
with gr.Tab("Top Images", elem_classes="tabs") as top_images_tab:
|
533 |
+
with gr.Row():
|
534 |
+
block_select = gr.Dropdown(
|
535 |
+
choices=["flux_18"] if is_flux else ["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
|
536 |
+
value="flux_18" if is_flux else "down.2.1 (composition)",
|
537 |
+
label="Select block"
|
538 |
+
)
|
539 |
+
brush_index = gr.Number(value=0, label="Brush index", minimum=0, maximum=model_config["num_features"]-1, precision=0)
|
540 |
+
with gr.Row():
|
541 |
+
image = gr.Image(width=600, height=600, label="Top Images")
|
542 |
+
|
543 |
+
block_select.select(update_top_images, [block_select, brush_index], outputs=[image])
|
544 |
+
brush_index.change(update_top_images, [block_select, brush_index], outputs=[image])
|
545 |
+
demo.load(update_top_images, [block_select, brush_index], outputs=[image])
|
546 |
+
return top_images_tab
|
547 |
+
|
548 |
+
|
549 |
+
def create_top_images_plus_search_part(retriever, demo, pipe):
|
550 |
+
|
551 |
+
model_config = MODELS_CONFIG[pipe.pipe.name_or_path]
|
552 |
+
|
553 |
+
|
554 |
+
|
555 |
+
if isinstance(pipe, HookedStableDiffusionXLPipeline):
|
556 |
+
is_flux = False
|
557 |
+
elif isinstance(pipe, CachedFLuxPipeline):
|
558 |
+
is_flux = True
|
559 |
+
else:
|
560 |
+
raise AssertionError(f"Unknown pipe class: {type(pipe)}")
|
561 |
+
|
562 |
+
def update_cache(block_select, search_by_text, search_by_index):
|
563 |
+
if search_by_text == "":
|
564 |
+
top_indices = []
|
565 |
+
index = search_by_index
|
566 |
+
block = block_select.split(" ")[0]
|
567 |
+
|
568 |
+
# Define path for fetching image
|
569 |
+
if is_flux:
|
570 |
+
part = 1 if index <= 7000 else 2
|
571 |
+
url = f"https://huggingface.co/antoniomari/flux_sae_images/resolve/main/{block}/part{part}/{index}.jpg"
|
572 |
+
else:
|
573 |
+
url = f"https://huggingface.co/surokpro2/sdxl_sae_images/resolve/main/{block}/{index}.jpg"
|
574 |
+
return url, {"image": url, "feature_idx": index, "features": top_indices}
|
575 |
+
else:
|
576 |
+
# TODO
|
577 |
+
if retriever is None:
|
578 |
+
raise ValueError("Feature retrieval is not enabled")
|
579 |
+
lock.acquire()
|
580 |
+
try:
|
581 |
+
top_indices = list(retriever.query_text(search_by_text, block_select.split(" ")[0]).keys())
|
582 |
+
finally:
|
583 |
+
lock.release()
|
584 |
+
block = block_select.split(" ")[0]
|
585 |
+
top_indices = list(map(int, top_indices))
|
586 |
+
index = top_indices[0]
|
587 |
+
url = f"https://huggingface.co/surokpro2/sdxl_sae_images/resolve/main/{block}/{index}.jpg"
|
588 |
+
return url, {"image": url, "feature_idx": index, "features": top_indices[:20]}
|
589 |
+
|
590 |
+
def update_radio(cache):
|
591 |
+
return gr.update(choices=cache["features"], value=cache["feature_idx"])
|
592 |
+
|
593 |
+
def update_img(cache, block_select, index):
|
594 |
+
block = block_select.split(" ")[0]
|
595 |
+
url = f"https://huggingface.co/surokpro2/sdxl_sae_images/resolve/main/{block}/{index}.jpg"
|
596 |
+
return url
|
597 |
+
|
598 |
+
with gr.Tab("Top Images", elem_classes="tabs") as explore_tab:
|
599 |
+
cache = gr.State(value={
|
600 |
+
"image": None,
|
601 |
+
"feature_idx": None,
|
602 |
+
"features": []
|
603 |
+
})
|
604 |
+
with gr.Row():
|
605 |
+
with gr.Column(scale=7):
|
606 |
+
with gr.Row():
|
607 |
+
# top images
|
608 |
+
image = gr.Image(width=600, height=600, image_mode="RGB", label="Top images")
|
609 |
+
|
610 |
+
with gr.Column(scale=4):
|
611 |
+
block_select = gr.Dropdown(
|
612 |
+
choices=["flux_18"] if is_flux else ["up.0.1 (style)", "down.2.1 (composition)", "up.0.0 (details)", "mid.0"],
|
613 |
+
value="flux_18" if is_flux else "down.2.1 (composition)",
|
614 |
+
label="Select block",
|
615 |
+
elem_id="block_select",
|
616 |
+
interactive=True
|
617 |
+
)
|
618 |
+
search_by_index = gr.Number(value=0, label="Search by index", minimum=0, maximum=model_config["num_features"]-1, precision=0)
|
619 |
+
search_by_text = gr.Textbox(lines=1, label="Search by text", value="", visible=False)
|
620 |
+
radio = gr.Radio(choices=[], label="Select a feature", interactive=True, visible=False)
|
621 |
+
|
622 |
+
|
623 |
+
search_by_text.change(update_cache,
|
624 |
+
[block_select, search_by_text, search_by_index],
|
625 |
+
outputs=[image, cache])
|
626 |
+
block_select.select(update_cache,
|
627 |
+
[block_select, search_by_text, search_by_index],
|
628 |
+
outputs=[image, cache])
|
629 |
+
cache.change(update_radio, [cache], outputs=[radio])
|
630 |
+
radio.select(update_img, [cache, block_select, radio], outputs=[image])
|
631 |
+
search_by_index.change(update_img, [cache, block_select, search_by_index], outputs=[image])
|
632 |
+
demo.load(update_img,
|
633 |
+
[cache, block_select, search_by_index],
|
634 |
+
outputs=[image])
|
635 |
+
|
636 |
+
return explore_tab
|
637 |
+
|
638 |
+
|
639 |
+
def create_intro_part():
|
640 |
+
with gr.Tab("Instructions", elem_classes="tabs") as intro_tab:
|
641 |
+
gr.Markdown(
|
642 |
+
'''# Unpacking SDXL Turbo with Sparse Autoencoders
|
643 |
+
## Demo Overview
|
644 |
+
This demo showcases the use of Sparse Autoencoders (SAEs) to understand the features learned by the Stable Diffusion XL Turbo model.
|
645 |
+
|
646 |
+
## How to Use
|
647 |
+
### Explore
|
648 |
+
* Enter a prompt in the text box and click on the "Generate" button to generate an image.
|
649 |
+
* You can observe the active features in different blocks plotted on top of the generated image.
|
650 |
+
### Top Images
|
651 |
+
* For each feature, you can view the top images that activate the feature the most.
|
652 |
+
### Paint!
|
653 |
+
* Generate an image using the prompt.
|
654 |
+
* Paint on the generated image to apply interventions.
|
655 |
+
* Use the "Feature Icon" button to understand how the selected brush functions.
|
656 |
+
|
657 |
+
### Remarks
|
658 |
+
* Not all brushes mix well with all images. Experiment with different brushes and strengths.
|
659 |
+
* Feature Icon works best with `down.2.1 (composition)` and `up.0.1 (style)` blocks.
|
660 |
+
* This demo is provided for research purposes only. We do not take responsibility for the content generated by the demo.
|
661 |
+
|
662 |
+
### Interesting features to try
|
663 |
+
To get started, try the following features:
|
664 |
+
- down.2.1 (composition): 2301 (evil) 3747 (image frame) 4998 (cartoon)
|
665 |
+
- up.0.1 (style): 4977 (tiger stripes), 90 (fur), 2615 (twilight blur)
|
666 |
+
'''
|
667 |
+
)
|
668 |
+
|
669 |
+
return intro_tab
|
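# Editor's note: a minimal sketch (not part of the uploaded file) showing how the
# "Top Images" tab resolves a feature to its preview grid. It rebuilds the same
# Hugging Face URL that update_img constructs above; the block label and feature
# index below are just the examples suggested in the instructions text.
def top_images_url(block_label: str, feature_idx: int) -> str:
    block = block_label.split(" ")[0]  # "down.2.1 (composition)" -> "down.2.1"
    return f"https://huggingface.co/surokpro2/sdxl_sae_images/resolve/main/{block}/{feature_idx}.jpg"

print(top_images_url("down.2.1 (composition)", 2301))  # the "evil" feature listed above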
670 |
+
|
671 |
+
|
672 |
+
def create_demo(pipe, saes_dict, means_dict, use_retrieval=True):
|
673 |
+
custom_css = """
|
674 |
+
.tabs button {
|
675 |
+
font-size: 20px !important; /* Adjust font size for tab text */
|
676 |
+
padding: 10px !important; /* Adjust padding to make the tabs bigger */
|
677 |
+
font-weight: bold !important; /* Adjust font weight to make the text bold */
|
678 |
+
}
|
679 |
+
.generate_button1 {
|
680 |
+
max-width: 160px !important;
|
681 |
+
margin-top: 20px !important;
|
682 |
+
margin-bottom: 20px !important;
|
683 |
+
}
|
684 |
+
"""
|
685 |
+
if use_retrieval:
|
686 |
+
retriever = None # FeatureRetriever()
|
687 |
+
else:
|
688 |
+
retriever = None
|
689 |
+
|
690 |
+
with gr.Blocks(css=custom_css) as demo:
|
691 |
+
# with create_intro_part():
|
692 |
+
# pass
|
693 |
+
with create_prompt_part(pipe, saes_dict, demo):
|
694 |
+
pass
|
695 |
+
with create_top_images_part(demo, pipe):
|
696 |
+
pass
|
697 |
+
with create_intervene_part(pipe, saes_dict, means_dict, demo):
|
698 |
+
pass
|
699 |
+
|
700 |
+
return demo
|
701 |
+
|
702 |
+
|
703 |
+
if __name__ == "__main__":
|
704 |
+
import os
|
705 |
+
import gradio as gr
|
706 |
+
import torch
|
707 |
+
from SDLens import HookedStableDiffusionXLPipeline
|
708 |
+
from SAE import SparseAutoencoder
|
709 |
+
from huggingface_hub import hf_hub_download
|
710 |
+
|
711 |
+
dtype = torch.float16
|
712 |
+
pipe = EditedFluxPipeline.from_pretrained(
|
713 |
+
"black-forest-labs/FLUX.1-schnell",
|
714 |
+
device_map="balanced",
|
715 |
+
torch_dtype=dtype
|
716 |
+
)
|
717 |
+
pipe.set_progress_bar_config(disable=True)
|
718 |
+
pipe = CachedFLuxPipeline(pipe)
|
719 |
+
|
720 |
+
# Parameters
|
721 |
+
DEVICE = "cuda"
|
722 |
+
|
723 |
+
# Hugging Face repo setup
|
724 |
+
HF_REPO_ID = "antoniomari/SAE_flux_18"
|
725 |
+
HF_BRANCH = "main"
|
726 |
+
|
727 |
+
# Command-line arguments
|
728 |
+
block_code = "18"
|
729 |
+
block_name = code_to_block_flux[block_code]
|
730 |
+
|
731 |
+
saes_dict = {}
|
732 |
+
means_dict = {}
|
733 |
+
|
734 |
+
# Download files from the root of the repo
|
735 |
+
state_dict_path = hf_hub_download(
|
736 |
+
repo_id=HF_REPO_ID,
|
737 |
+
filename="state_dict.pth",
|
738 |
+
revision=HF_BRANCH
|
739 |
+
)
|
740 |
+
|
741 |
+
config_path = hf_hub_download(
|
742 |
+
repo_id=HF_REPO_ID,
|
743 |
+
filename="config.json",
|
744 |
+
revision=HF_BRANCH
|
745 |
+
)
|
746 |
+
|
747 |
+
mean_path = hf_hub_download(
|
748 |
+
repo_id=HF_REPO_ID,
|
749 |
+
filename="mean.pt",
|
750 |
+
revision=HF_BRANCH
|
751 |
+
)
|
752 |
+
|
753 |
+
# Load config and model
|
754 |
+
with open(config_path, "r") as f:
|
755 |
+
config = json.load(f)
|
756 |
+
|
757 |
+
sae = SparseAutoencoder(**config)
|
758 |
+
checkpoint = torch.load(state_dict_path, map_location=DEVICE)
|
759 |
+
state_dict = checkpoint["state_dict"]
|
760 |
+
sae.load_state_dict(state_dict)
|
761 |
+
sae = sae.to(DEVICE, dtype=torch.float16).eval()
|
762 |
+
means = torch.load(mean_path, map_location=DEVICE).to(dtype)
|
763 |
+
|
764 |
+
saes_dict[block_code] = sae
|
765 |
+
means_dict[block_code] = means
|
766 |
+
|
767 |
+
demo = create_demo(pipe, saes_dict, means_dict)
|
768 |
+
demo.launch()
|
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
|
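For reference, a hedged sketch of how one of these checkpoint folders can be loaded back into a SparseAutoencoder, mirroring the loading code in app.py above; the exact payload of state_dict.pth (a raw state dict vs. a dict wrapping a "state_dict" key) is an assumption here, so both cases are handled.

# Hedged sketch: rebuild the SAE for this block from the checkpoint folder below.
import json
import torch
from SAE import SparseAutoencoder

ckpt = "checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final"
with open(f"{ckpt}/config.json") as f:
    config = json.load(f)  # {"n_dirs_local": 5120, "d_model": 1280, "k": 10, ...}

sae = SparseAutoencoder(**config)
payload = torch.load(f"{ckpt}/state_dict.pth", map_location="cpu")
sae.load_state_dict(payload.get("state_dict", payload))  # accept wrapped or raw checkpoints
mean = torch.load(f"{ckpt}/mean.pt", map_location="cpu")  # per-dimension activation mean
std = torch.load(f"{ckpt}/std.pt", map_location="cpu")    # per-dimension activation std
sae.eval()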
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:387f2b6f8c4e4a6f1227921f28f00dfa4beb2bd4e422b7eb592cd8627af0e58f
|
3 |
+
size 21581
|
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:39e3c6d17aa572a53368ca8ba8f82757947a3caf14fe654e84b175d0dc0a4650
|
3 |
+
size 52497831
|
checkpoints/unet.down_blocks.2.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6ca694c9504a7a8aa827004d3fdec5c1cb8fcf3904acc3562d1861fc6e65c19
|
3 |
+
size 21576
|
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
|
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80790481d0e56ac3fa36599703cee7a05cfb4cc078db57c8f9180e860c330e1d
|
3 |
+
size 21581
|
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:49d38d9178c2a2780e04a5482a2feb9548c6e9a636ed1bf85291acf42e0ffa34
|
3 |
+
size 52497831
|
checkpoints/unet.mid_block.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb6bfc7ce5e596f8aa048ab262ca56841868c222bf07eb2ed35b6e4f7094fea6
|
3 |
+
size 21576
|
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
|
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de036d0fb9ee663f7bdf60e4a5d89d038516dae637531676b53ff75d05eab46b
|
3 |
+
size 21581
|
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:14c45efd9cce0258f014c49babdcd0e9ce8b266fe31eed72db1a45b990a1a0f8
|
3 |
+
size 52497831
|
checkpoints/unet.up_blocks.0.attentions.0_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cb9c04499ccae041987cc262894e254c2f04288857a8a0470cfb1b86a8ecfa09
|
3 |
+
size 21576
|
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/config.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"n_dirs_local": 5120, "d_model": 1280, "k": 10, "auxk": 256, "dead_steps_threshold": 2441}
|
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/mean.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:96dbf6fffe9d62c3b3352f8e4fe48c54dfd69906cf8ad6828d5ce93db9a5f0dc
|
3 |
+
size 21581
|
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/state_dict.pth
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f8eed82f4bcb2f010ae9075f10a1ece801ee3dec46dba7fadccc35f6c0a7836b
|
3 |
+
size 52497831
|
checkpoints/unet.up_blocks.0.attentions.1_k10_hidden5120_auxk256_bs4096_lr0.0001/final/std.pt
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe5c5be0c4c2d2b57e7888319053cb64929559f947c8ce445ddd6a397302afab
|
3 |
+
size 21576
|
colab_requirements.txt
ADDED
@@ -0,0 +1,8 @@
1 |
+
diffusers==0.29.2
|
2 |
+
gradio==5.23.2
|
3 |
+
numpy
|
4 |
+
matplotlib
|
5 |
+
pillow
|
6 |
+
einops
|
7 |
+
transformers
|
8 |
+
huggingface_hub
|
example.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
1 |
+
diffusers==0.29.2
|
2 |
+
gradio==4.44.1
|
3 |
+
torch>=2.4.0
|
4 |
+
numpy
|
5 |
+
matplotlib
|
6 |
+
pillow
|
7 |
+
wandb
|
8 |
+
einops
|
9 |
+
transformers
|
10 |
+
accelerate
|
11 |
+
huggingface_hub
|
12 |
+
git+https://github.com/wendlerc/clip-retrieval.git
|
resourses/image.png
ADDED
[image preview omitted; binary file tracked via Git LFS]
retrieval.py
ADDED
@@ -0,0 +1,71 @@
1 |
+
from huggingface_hub import snapshot_download
|
2 |
+
from clip_retrieval.clip_back import load_clip_indices, KnnService, ClipOptions
|
3 |
+
from collections import defaultdict
|
4 |
+
import os
|
5 |
+
import glob
|
6 |
+
import shutil
|
7 |
+
import random
|
8 |
+
|
9 |
+
class FeatureRetriever:
|
10 |
+
def __init__(self,
|
11 |
+
num_images=50,
|
12 |
+
imgs_per_dir=15,
|
13 |
+
force_download=False):
|
14 |
+
|
15 |
+
if force_download or not os.path.exists("./clip"):
|
16 |
+
print("Downloading clip resources")
|
17 |
+
rand_num = random.randint(0, 100000)
|
18 |
+
tmp_dir = f"./tmp_{rand_num}"
|
19 |
+
snapshot_download(repo_type="dataset", repo_id="wendlerc/sdxl-unbox-clip-indices", cache_dir=tmp_dir)
|
20 |
+
clip_dirs = glob.glob(f"{tmp_dir}/**/down_10_5120", recursive=True)
|
21 |
+
if len(clip_dirs) > 0:
|
22 |
+
shutil.copytree(clip_dirs[0].replace("down_10_5120", ""), "./clip", dirs_exist_ok=True)
|
23 |
+
shutil.rmtree(tmp_dir)
|
24 |
+
else:
|
25 |
+
raise ValueError("Could not find clip indices in the downloaded repo.")
|
26 |
+
|
27 |
+
# Initialize CLIP service
|
28 |
+
clip_options = ClipOptions(
|
29 |
+
indice_folder="currently unused by knn.query()",
|
30 |
+
clip_model="ViT-B/32", #"open_clip:ViT-H-14",
|
31 |
+
enable_hdf5=False,
|
32 |
+
enable_faiss_memory_mapping=True,
|
33 |
+
columns_to_return=["image_path", "similarity"],
|
34 |
+
reorder_metadata_by_ivf_index=False,
|
35 |
+
enable_mclip_option=False,
|
36 |
+
use_jit=False,
|
37 |
+
use_arrow=False,
|
38 |
+
provide_safety_model=False,
|
39 |
+
provide_violence_detector=False,
|
40 |
+
provide_aesthetic_embeddings=False,
|
41 |
+
)
|
42 |
+
self.names = ["down.2.1", "mid.0", "up.0.0", "up.0.1"]
|
43 |
+
self.paths = ["./clip/down_10_5120/indices_paths.json",
|
44 |
+
"./clip/mid_10_5120/indices_paths.json",
|
45 |
+
"./clip/up0_10_5120/indices_paths.json",
|
46 |
+
"./clip/up_10_5120/indices_paths.json",]
|
47 |
+
self.knn_service = {}
|
48 |
+
for name, path in zip(self.names, self.paths):
|
49 |
+
resources = load_clip_indices(path, clip_options)
|
50 |
+
self.knn_service[name] = KnnService(clip_resources=resources)
|
51 |
+
self.num_images = num_images
|
52 |
+
self.imgs_per_dir = imgs_per_dir
|
53 |
+
|
54 |
+
def query_text(self, query, block):
|
55 |
+
if block not in self.names:
|
56 |
+
raise ValueError(f"Block must be one of {self.names}")
|
57 |
+
results = self.knn_service[block].query(
|
58 |
+
text_input=query,
|
59 |
+
num_images=self.num_images,
|
60 |
+
num_result_ids=self.num_images,
|
61 |
+
deduplicate=True,
|
62 |
+
)
|
63 |
+
feat_sims = defaultdict(list)
|
64 |
+
feat_scores = {}
|
65 |
+
for result in results:
|
66 |
+
feature_id = result["image_path"].split("/")[-2]
|
67 |
+
feat_sims[feature_id] += [result["similarity"]]
|
68 |
+
for fid, sims in feat_sims.items():
|
69 |
+
feat_scores[fid] = (sum(sims) / len(sims)) * (len(sims)/self.imgs_per_dir)
|
70 |
+
|
71 |
+
return dict(sorted(feat_scores.items(), key=lambda item: -item[1]))
|
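A minimal usage sketch for FeatureRetriever, assuming the CLIP index download above succeeds; the query string and block name are only examples.

# Hedged usage sketch (assumes the clip indices from wendlerc/sdxl-unbox-clip-indices are available).
retriever = FeatureRetriever()
scores = retriever.query_text("tiger stripes", block="up.0.1")
# scores maps feature ids to an average CLIP similarity weighted by how many of the
# feature's top images were retrieved; the dict is sorted with the best match first.
for feature_id, score in list(scores.items())[:5]:
    print(feature_id, round(score, 3))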
scripts/collect_latents_dataset.py
ADDED
@@ -0,0 +1,96 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import io
|
4 |
+
import tarfile
|
5 |
+
import torch
|
6 |
+
import webdataset as wds
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from tqdm import tqdm
|
10 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
11 |
+
from SDLens.hooked_sd_pipeline import HookedStableDiffusionXLPipeline
|
12 |
+
|
13 |
+
import datetime
|
14 |
+
from datasets import load_dataset
|
15 |
+
from torch.utils.data import DataLoader
|
16 |
+
import diffusers
|
17 |
+
import fire
|
18 |
+
|
19 |
+
def main(save_path, start_at=0, finish_at=30000, dataset_batch_size=50):
|
20 |
+
blocks_to_save = [
|
21 |
+
'unet.down_blocks.2.attentions.1',
|
22 |
+
'unet.mid_block.attentions.0',
|
23 |
+
'unet.up_blocks.0.attentions.0',
|
24 |
+
'unet.up_blocks.0.attentions.1',
|
25 |
+
]
|
26 |
+
|
27 |
+
# Initialization
|
28 |
+
dataset = load_dataset("guangyil/laion-coco-aesthetic", split="train", columns=["caption"], streaming=True).shuffle(seed=42)
|
29 |
+
pipe = HookedStableDiffusionXLPipeline.from_pretrained('stabilityai/sdxl-turbo')
|
30 |
+
pipe.to('cuda')
|
31 |
+
pipe.set_progress_bar_config(disable=True)
|
32 |
+
dataloader = DataLoader(dataset, batch_size=dataset_batch_size)
|
33 |
+
|
34 |
+
ct = datetime.datetime.now()
|
35 |
+
save_path = os.path.join(save_path, str(ct))
|
36 |
+
# Collecting dataset
|
37 |
+
os.makedirs(save_path, exist_ok=True)
|
38 |
+
|
39 |
+
writers = {
|
40 |
+
block: wds.TarWriter(f'{save_path}/{block}.tar') for block in blocks_to_save
|
41 |
+
}
|
42 |
+
|
43 |
+
writers.update({'images': wds.TarWriter(f'{save_path}/images.tar')})
|
44 |
+
|
45 |
+
def to_kwargs(kwargs_to_save):
|
46 |
+
kwargs = kwargs_to_save.copy()
|
47 |
+
seed = kwargs['seed']
|
48 |
+
del kwargs['seed']
|
49 |
+
kwargs['generator'] = torch.Generator(device="cpu").manual_seed(seed)
|
50 |
+
return kwargs
|
51 |
+
|
52 |
+
dataloader_iter = iter(dataloader)
|
53 |
+
for num_document, batch in tqdm(enumerate(dataloader)):
|
54 |
+
if num_document < start_at:
|
55 |
+
continue
|
56 |
+
|
57 |
+
if num_document >= finish_at:
|
58 |
+
break
|
59 |
+
|
60 |
+
kwargs_to_save = {
|
61 |
+
'prompt': batch['caption'],
|
62 |
+
'positions_to_cache': blocks_to_save,
|
63 |
+
'save_input': True,
|
64 |
+
'save_output': True,
|
65 |
+
'num_inference_steps': 1,
|
66 |
+
'guidance_scale': 0.0,
|
67 |
+
'seed': num_document,
|
68 |
+
'output_type': 'pil'
|
69 |
+
}
|
70 |
+
|
71 |
+
kwargs = to_kwargs(kwargs_to_save)
|
72 |
+
|
73 |
+
output, cache = pipe.run_with_cache(
|
74 |
+
**kwargs
|
75 |
+
)
|
76 |
+
|
77 |
+
blocks = cache['input'].keys()
|
78 |
+
for block in blocks:
|
79 |
+
sample = {
|
80 |
+
"__key__": f"sample_{num_document}",
|
81 |
+
"output.pth": cache['output'][block],
|
82 |
+
"diff.pth": cache['output'][block] - cache['input'][block],
|
83 |
+
"gen_args.json": kwargs_to_save
|
84 |
+
}
|
85 |
+
|
86 |
+
writers[block].write(sample)
|
87 |
+
writers['images'].write({
|
88 |
+
"__key__": f"sample_{num_document}",
|
89 |
+
"images.npy": np.stack(output.images)
|
90 |
+
})
|
91 |
+
|
92 |
+
for block, writer in writers.items():
|
93 |
+
writer.close()
|
94 |
+
|
95 |
+
if __name__ == '__main__':
|
96 |
+
fire.Fire(main)
|
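Since the entry point is wrapped with fire.Fire, the collection script is driven from the command line; below is a hypothetical invocation (the save path and step counts are placeholders) together with a quick way to list the shards it writes.

# Hypothetical invocation via python-fire (paths and counts are placeholders):
#   python scripts/collect_latents_dataset.py --save_path=./latents --start_at=0 --finish_at=1000
# Each run creates a timestamped folder with one .tar shard per hooked block plus images.tar.
import glob
for shard in sorted(glob.glob("./latents/*/*.tar")):
    print(shard)  # e.g. .../unet.down_blocks.2.attentions.1.tar, .../images.tar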
scripts/train_sae.py
ADDED
@@ -0,0 +1,308 @@
1 |
+
'''
|
2 |
+
Adapted from
|
3 |
+
https://github.com/openai/sparse_autoencoder/blob/main/sparse_autoencoder/train.py
|
4 |
+
'''
|
5 |
+
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
|
10 |
+
from typing import Callable, Iterable, Iterator
|
11 |
+
|
12 |
+
import torch
|
13 |
+
import torch.distributed as dist
|
14 |
+
import torch.nn as nn
|
15 |
+
import torch.nn.functional as F
|
16 |
+
from torch.distributed import ReduceOp
|
17 |
+
from SAE.dataset_iterator import ActivationsDataloader
|
18 |
+
from SAE.sae import SparseAutoencoder, unit_norm_decoder_, unit_norm_decoder_grad_adjustment_
|
19 |
+
from SAE.sae_utils import SAETrainingConfig, Config
|
20 |
+
|
21 |
+
from types import SimpleNamespace
|
22 |
+
from typing import Optional, List
|
23 |
+
import json
|
24 |
+
|
25 |
+
import tqdm
|
26 |
+
|
27 |
+
def weighted_average(points: torch.Tensor, weights: torch.Tensor):
|
28 |
+
weights = weights / weights.sum()
|
29 |
+
return (points * weights.view(-1, 1)).sum(dim=0)
|
30 |
+
|
31 |
+
|
32 |
+
@torch.no_grad()
|
33 |
+
def geometric_median_objective(
|
34 |
+
median: torch.Tensor, points: torch.Tensor, weights: torch.Tensor
|
35 |
+
) -> torch.Tensor:
|
36 |
+
|
37 |
+
norms = torch.linalg.norm(points - median.view(1, -1), dim=1) # type: ignore
|
38 |
+
|
39 |
+
return (norms * weights).sum()
|
40 |
+
|
41 |
+
|
42 |
+
def compute_geometric_median(
|
43 |
+
points: torch.Tensor,
|
44 |
+
weights: Optional[torch.Tensor] = None,
|
45 |
+
eps: float = 1e-6,
|
46 |
+
maxiter: int = 100,
|
47 |
+
ftol: float = 1e-20,
|
48 |
+
do_log: bool = False,
|
49 |
+
):
|
50 |
+
"""
|
51 |
+
:param points: ``torch.Tensor`` of shape ``(n, d)``
|
52 |
+
:param weights: Optional ``torch.Tensor`` of shape :math:``(n,)``.
|
53 |
+
:param eps: Smallest allowed value of denominator, to avoid divide by zero.
|
54 |
+
Equivalently, this is a smoothing parameter. Default 1e-6.
|
55 |
+
:param maxiter: Maximum number of Weiszfeld iterations. Default 100
|
56 |
+
:param ftol: If objective value does not improve by at least this `ftol` fraction, terminate the algorithm. Default 1e-20.
|
57 |
+
:param do_log: If true will return a log of function values encountered through the course of the algorithm
|
58 |
+
:return: SimpleNamespace object with fields
|
59 |
+
- `median`: estimate of the geometric median, which is a ``torch.Tensor`` object of shape :math:``(d,)``
|
60 |
+
- `termination`: string explaining how the algorithm terminated.
|
61 |
+
- `logs`: function values encountered through the course of the algorithm in a list (None if do_log is false).
|
62 |
+
"""
|
63 |
+
with torch.no_grad():
|
64 |
+
|
65 |
+
if weights is None:
|
66 |
+
weights = torch.ones((points.shape[0],), device=points.device)
|
67 |
+
# initialize median estimate at mean
|
68 |
+
new_weights = weights
|
69 |
+
median = weighted_average(points, weights)
|
70 |
+
objective_value = geometric_median_objective(median, points, weights)
|
71 |
+
if do_log:
|
72 |
+
logs = [objective_value]
|
73 |
+
else:
|
74 |
+
logs = None
|
75 |
+
|
76 |
+
# Weiszfeld iterations
|
77 |
+
early_termination = False
|
78 |
+
pbar = tqdm.tqdm(range(maxiter))
|
79 |
+
for _ in pbar:
|
80 |
+
prev_obj_value = objective_value
|
81 |
+
|
82 |
+
norms = torch.linalg.norm(points - median.view(1, -1), dim=1) # type: ignore
|
83 |
+
new_weights = weights / torch.clamp(norms, min=eps)
|
84 |
+
median = weighted_average(points, new_weights)
|
85 |
+
objective_value = geometric_median_objective(median, points, weights)
|
86 |
+
|
87 |
+
if logs is not None:
|
88 |
+
logs.append(objective_value)
|
89 |
+
if abs(prev_obj_value - objective_value) <= ftol * objective_value:
|
90 |
+
early_termination = True
|
91 |
+
break
|
92 |
+
|
93 |
+
pbar.set_description(f"Objective value: {objective_value:.4f}")
|
94 |
+
|
95 |
+
median = weighted_average(points, new_weights) # allow autodiff to track it
|
96 |
+
return SimpleNamespace(
|
97 |
+
median=median,
|
98 |
+
new_weights=new_weights,
|
99 |
+
termination=(
|
100 |
+
"function value converged within tolerance"
|
101 |
+
if early_termination
|
102 |
+
else "maximum iterations reached"
|
103 |
+
),
|
104 |
+
logs=logs,
|
105 |
+
)
|
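# Editor's note: an illustrative sanity check (not part of the uploaded file) for
# compute_geometric_median, relying only on what the docstring above states:
# points of shape (n, d) and a SimpleNamespace result with a `median` field.
import torch
points = torch.randn(1024, 1280)        # (n, d); d_model is 1280 in the shipped SAE configs
result = compute_geometric_median(points, maxiter=50)
print(result.termination)               # e.g. "function value converged within tolerance"
print(result.median.shape)              # torch.Size([1280])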
106 |
+
|
107 |
+
def maybe_transpose(x):
|
108 |
+
return x.T if not x.is_contiguous() and x.T.is_contiguous() else x
|
109 |
+
|
110 |
+
import wandb
|
111 |
+
|
112 |
+
RANK = 0
|
113 |
+
|
114 |
+
class Logger:
|
115 |
+
def __init__(self, sae_name, **kws):
|
116 |
+
self.vals = {}
|
117 |
+
self.enabled = (RANK == 0) and not kws.pop("dummy", False)
|
118 |
+
self.sae_name = sae_name
|
119 |
+
|
120 |
+
def logkv(self, k, v):
|
121 |
+
if self.enabled:
|
122 |
+
self.vals[f'{self.sae_name}/{k}'] = v.detach() if isinstance(v, torch.Tensor) else v
|
123 |
+
return v
|
124 |
+
|
125 |
+
def dumpkvs(self, step):
|
126 |
+
if self.enabled:
|
127 |
+
wandb.log(self.vals, step=step)
|
128 |
+
self.vals = {}
|
129 |
+
|
130 |
+
|
131 |
+
class FeaturesStats:
|
132 |
+
def __init__(self, dim, logger):
|
133 |
+
self.dim = dim
|
134 |
+
self.logger = logger
|
135 |
+
self.reinit()
|
136 |
+
|
137 |
+
def reinit(self):
|
138 |
+
self.n_activated = torch.zeros(self.dim, dtype=torch.long, device="cuda")
|
139 |
+
self.n = 0
|
140 |
+
|
141 |
+
def update(self, inds):
|
142 |
+
self.n += inds.shape[0]
|
143 |
+
inds = inds.flatten().detach()
|
144 |
+
self.n_activated.scatter_add_(0, inds, torch.ones_like(inds))
|
145 |
+
|
146 |
+
def log(self):
|
147 |
+
self.logger.logkv('activated', (self.n_activated / self.n + 1e-9).log10().cpu().numpy())
|
148 |
+
|
149 |
+
def training_loop_(
|
150 |
+
aes,
|
151 |
+
train_acts_iter,
|
152 |
+
loss_fn,
|
153 |
+
log_interval,
|
154 |
+
save_interval,
|
155 |
+
loggers,
|
156 |
+
sae_cfgs,
|
157 |
+
):
|
158 |
+
sae_packs = []
|
159 |
+
for ae, cfg, logger in zip(aes, sae_cfgs, loggers):
|
160 |
+
pbar = tqdm.tqdm(unit=" steps", desc="Training Loss: ")
|
161 |
+
fstats = FeaturesStats(ae.n_dirs, logger)
|
162 |
+
opt = torch.optim.Adam(ae.parameters(), lr=cfg.lr, eps=cfg.eps, fused=True)
|
163 |
+
sae_packs.append((ae, cfg, logger, pbar, fstats, opt))
|
164 |
+
|
165 |
+
for i, flat_acts_train_batch in enumerate(train_acts_iter):
|
166 |
+
flat_acts_train_batch = flat_acts_train_batch.cuda()
|
167 |
+
|
168 |
+
for ae, cfg, logger, pbar, fstats, opt in sae_packs:
|
169 |
+
recons, info = ae(flat_acts_train_batch)
|
170 |
+
loss = loss_fn(ae, cfg, flat_acts_train_batch, recons, info, logger)
|
171 |
+
|
172 |
+
fstats.update(info['inds'])
|
173 |
+
|
174 |
+
bs = flat_acts_train_batch.shape[0]
|
175 |
+
logger.logkv('not-activated 1e4', (ae.stats_last_nonzero > 1e4 / bs).mean(dtype=float).item())
|
176 |
+
logger.logkv('not-activated 1e6', (ae.stats_last_nonzero > 1e6 / bs).mean(dtype=float).item())
|
177 |
+
logger.logkv('not-activated 1e7', (ae.stats_last_nonzero > 1e7 / bs).mean(dtype=float).item())
|
178 |
+
|
179 |
+
logger.logkv('explained variance', explained_variance(recons, flat_acts_train_batch))
|
180 |
+
logger.logkv('l2_div', (torch.linalg.norm(recons, dim=1) / torch.linalg.norm(flat_acts_train_batch, dim=1)).mean())
|
181 |
+
|
182 |
+
if (i + 1) % log_interval == 0:
|
183 |
+
fstats.log()
|
184 |
+
fstats.reinit()
|
185 |
+
|
186 |
+
if (i + 1) % save_interval == 0:
|
187 |
+
ae.save_to_disk(f"{cfg.save_path}/{i + 1}")
|
188 |
+
|
189 |
+
loss.backward()
|
190 |
+
|
191 |
+
unit_norm_decoder_(ae)
|
192 |
+
unit_norm_decoder_grad_adjustment_(ae)
|
193 |
+
|
194 |
+
opt.step()
|
195 |
+
opt.zero_grad()
|
196 |
+
logger.dumpkvs(i)
|
197 |
+
|
198 |
+
pbar.set_description(f"Training Loss {loss.item():.4f}")
|
199 |
+
pbar.update(1)
|
200 |
+
|
201 |
+
|
202 |
+
for ae, cfg, logger, pbar, fstats, opt in sae_packs:
|
203 |
+
pbar.close()
|
204 |
+
ae.save_to_disk(f"{cfg.save_path}/final")
|
205 |
+
|
206 |
+
|
207 |
+
def init_from_data_(ae, stats_acts_sample):
|
208 |
+
ae.pre_bias.data = (
|
209 |
+
compute_geometric_median(stats_acts_sample[:32768].float().cpu()).median.cuda().float()
|
210 |
+
)
|
211 |
+
|
212 |
+
|
213 |
+
def mse(recons, x):
|
214 |
+
# return ((recons - x) ** 2).sum(dim=-1).mean()
|
215 |
+
return ((recons - x) ** 2).mean()
|
216 |
+
|
217 |
+
def normalized_mse(recon: torch.Tensor, xs: torch.Tensor) -> torch.Tensor:
|
218 |
+
# only used for auxk
|
219 |
+
xs_mu = xs.mean(dim=0)
|
220 |
+
|
221 |
+
loss = mse(recon, xs) / mse(
|
222 |
+
xs_mu[None, :].broadcast_to(xs.shape), xs
|
223 |
+
)
|
224 |
+
|
225 |
+
return loss
|
226 |
+
|
227 |
+
def explained_variance(recons, x):
|
228 |
+
# Compute the variance of the difference
|
229 |
+
diff = x - recons
|
230 |
+
diff_var = torch.var(diff, dim=0, unbiased=False)
|
231 |
+
|
232 |
+
# Compute the variance of the original tensor
|
233 |
+
x_var = torch.var(x, dim=0, unbiased=False)
|
234 |
+
|
235 |
+
# Avoid division by zero
|
236 |
+
explained_var = 1 - diff_var / (x_var + 1e-8)
|
237 |
+
|
238 |
+
return explained_var.mean()
|
239 |
+
|
240 |
+
|
241 |
+
def main():
|
242 |
+
cfg = Config(json.load(open('SAE/config.json')))
|
243 |
+
|
244 |
+
dataloader = ActivationsDataloader(cfg.paths_to_latents, cfg.block_name, cfg.bs)
|
245 |
+
|
246 |
+
acts_iter = dataloader.iterate()
|
247 |
+
stats_acts_sample = torch.cat([
|
248 |
+
next(acts_iter).cpu() for _ in range(10)
|
249 |
+
], dim=0)
|
250 |
+
|
251 |
+
aes = [
|
252 |
+
SparseAutoencoder(
|
253 |
+
n_dirs_local=sae.n_dirs,
|
254 |
+
d_model=sae.d_model,
|
255 |
+
k=sae.k,
|
256 |
+
auxk=sae.auxk,
|
257 |
+
dead_steps_threshold=sae.dead_toks_threshold // cfg.bs,
|
258 |
+
).cuda()
|
259 |
+
for sae in cfg.saes
|
260 |
+
]
|
261 |
+
|
262 |
+
for ae in aes:
|
263 |
+
init_from_data_(ae, stats_acts_sample)
|
264 |
+
|
265 |
+
mse_scale = (
|
266 |
+
1 / ((stats_acts_sample.float().mean(dim=0) - stats_acts_sample.float()) ** 2).mean()
|
267 |
+
)
|
268 |
+
mse_scale = mse_scale.item()
|
269 |
+
del stats_acts_sample
|
270 |
+
|
271 |
+
wandb.init(
|
272 |
+
project=cfg.wandb_project,
|
273 |
+
name=cfg.wandb_name,
|
274 |
+
)
|
275 |
+
|
276 |
+
loggers = [Logger(
|
277 |
+
sae_name=cfg_sae.sae_name,
|
278 |
+
dummy=False,
|
279 |
+
) for cfg_sae in cfg.saes]
|
280 |
+
|
281 |
+
training_loop_(
|
282 |
+
aes,
|
283 |
+
acts_iter,
|
284 |
+
lambda ae, cfg_sae, flat_acts_train_batch, recons, info, logger: (
|
285 |
+
# MSE
|
286 |
+
logger.logkv("train_recons", mse_scale * mse(recons, flat_acts_train_batch))
|
287 |
+
# AuxK
|
288 |
+
+ logger.logkv(
|
289 |
+
"train_maxk_recons",
|
290 |
+
cfg_sae.auxk_coef
|
291 |
+
* normalized_mse(
|
292 |
+
ae.decode_sparse(
|
293 |
+
info["auxk_inds"],
|
294 |
+
info["auxk_vals"],
|
295 |
+
),
|
296 |
+
flat_acts_train_batch - recons.detach() + ae.pre_bias.detach(),
|
297 |
+
).nan_to_num(0),
|
298 |
+
)
|
299 |
+
),
|
300 |
+
sae_cfgs=cfg.saes,
|
301 |
+
loggers=loggers,
|
302 |
+
log_interval=cfg.log_interval,
|
303 |
+
save_interval=cfg.save_interval,
|
304 |
+
)
|
305 |
+
|
306 |
+
|
307 |
+
if __name__ == "__main__":
|
308 |
+
main()
|
utils/__init__.py
ADDED
@@ -0,0 +1 @@
1 |
+
from .hooks import *
|
utils/hooks.py
ADDED
@@ -0,0 +1,145 @@
1 |
+
from typing import Callable, List, Optional
|
2 |
+
import torch
|
3 |
+
|
4 |
+
class TimedHook:
|
5 |
+
def __init__(self, hook_fn, total_steps, apply_at_steps=None):
|
6 |
+
self.hook_fn = hook_fn
|
7 |
+
self.total_steps = total_steps
|
8 |
+
self.apply_at_steps = apply_at_steps
|
9 |
+
self.current_step = 0
|
10 |
+
|
11 |
+
def identity(self, module, input, output):
|
12 |
+
return output
|
13 |
+
|
14 |
+
def __call__(self, module, input, output):
|
15 |
+
if self.apply_at_steps is not None:
|
16 |
+
if self.current_step in self.apply_at_steps:
|
17 |
+
self.__increment()
|
18 |
+
return self.hook_fn(module, input, output)
|
19 |
+
else:
|
20 |
+
self.__increment()
|
21 |
+
return self.identity(module, input, output)
|
22 |
+
|
23 |
+
return self.identity(module, input, output)
|
24 |
+
|
25 |
+
def __increment(self):
|
26 |
+
if self.current_step < self.total_steps:
|
27 |
+
self.current_step += 1
|
28 |
+
else:
|
29 |
+
self.current_step = 0
|
30 |
+
|
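# Editor's note: a hedged usage sketch (not part of the uploaded file) for TimedHook.
# It wraps an arbitrary forward hook so that it only fires at selected steps; the
# module below is a stand-in, not a block from the actual pipelines.
import torch
import torch.nn as nn

def print_shape_hook(module, input, output):
    print(type(module).__name__, output.shape)
    return output

block = nn.Identity()  # stand-in for a hooked attention block
timed = TimedHook(print_shape_hook, total_steps=4, apply_at_steps=[0, 3])
handle = block.register_forward_hook(timed)
for _ in range(4):  # one forward call per "denoising step"
    block(torch.randn(1, 1280, 16, 16))
handle.remove()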
31 |
+
@torch.no_grad()
|
32 |
+
def add_feature(sae, feature_idx, value, module, input, output):
|
33 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
34 |
+
activated = sae.encode(diff)
|
35 |
+
mask = torch.zeros_like(activated, device=diff.device)
|
36 |
+
mask[..., feature_idx] = value
|
37 |
+
to_add = mask @ sae.decoder.weight.T
|
38 |
+
return (output[0] + to_add.permute(0, 3, 1, 2).to(output[0].device),)
|
39 |
+
|
40 |
+
@torch.no_grad()
|
41 |
+
def add_feature_on_area_base(sae, feature_idx, activation_map, module, input, output):
|
42 |
+
return add_feature_on_area_base_both(sae, feature_idx, activation_map, module, input, output)
|
43 |
+
|
44 |
+
@torch.no_grad()
|
45 |
+
def add_feature_on_area_base_both(sae, feature_idx, activation_map, module, input, output):
|
46 |
+
# add the feature to cond and subtract from uncond
|
47 |
+
# this assumes diff.shape[0] == 2
|
48 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
49 |
+
activated = sae.encode(diff)
|
50 |
+
mask = torch.zeros_like(activated, device=diff.device)
|
51 |
+
if len(activation_map) == 2:
|
52 |
+
activation_map = activation_map.unsqueeze(0)
|
53 |
+
mask[..., feature_idx] = activation_map.to(mask.device)
|
54 |
+
to_add = mask @ sae.decoder.weight.T
|
55 |
+
to_add = to_add.chunk(2)
|
56 |
+
output[0][0] -= to_add[0].permute(0, 3, 1, 2).to(output[0].device)[0]
|
57 |
+
output[0][1] += to_add[1].permute(0, 3, 1, 2).to(output[0].device)[0]
|
58 |
+
return output
|
59 |
+
|
60 |
+
|
61 |
+
@torch.no_grad()
|
62 |
+
def add_feature_on_area_base_cond(sae, feature_idx, activation_map, module, input, output):
|
63 |
+
# add the feature to cond
|
64 |
+
# this assumes diff.shape[0] == 2
|
65 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
66 |
+
diff_uncond, diff_cond = diff.chunk(2)
|
67 |
+
activated = sae.encode(diff_cond)
|
68 |
+
mask = torch.zeros_like(activated, device=diff_cond.device)
|
69 |
+
if len(activation_map) == 2:
|
70 |
+
activation_map = activation_map.unsqueeze(0)
|
71 |
+
mask[..., feature_idx] = activation_map.to(mask.device)
|
72 |
+
to_add = mask @ sae.decoder.weight.T
|
73 |
+
output[0][1] += to_add.permute(0, 3, 1, 2).to(output[0].device)[0]
|
74 |
+
return output
|
75 |
+
|
76 |
+
|
77 |
+
@torch.no_grad()
|
78 |
+
def replace_with_feature_base(sae, feature_idx, value, module, input, output):
|
79 |
+
# this assumes diff.shape[0] == 2
|
80 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
81 |
+
diff_uncond, diff_cond = diff.chunk(2)
|
82 |
+
activated = sae.encode(diff_cond)
|
83 |
+
mask = torch.zeros_like(activated, device=diff_cond.device)
|
84 |
+
mask[..., feature_idx] = value
|
85 |
+
to_add = mask @ sae.decoder.weight.T
|
86 |
+
input[0][1] += to_add.permute(0, 3, 1, 2).to(output[0].device)[0]
|
87 |
+
return input
|
88 |
+
|
89 |
+
|
90 |
+
@torch.no_grad()
|
91 |
+
def add_feature_on_area_turbo(sae, feature_idx, activation_map, module, input, output):
|
92 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
93 |
+
activated = sae.encode(diff)
|
94 |
+
mask = torch.zeros_like(activated, device=diff.device)
|
95 |
+
if len(activation_map) == 2:
|
96 |
+
activation_map = activation_map.unsqueeze(0)
|
97 |
+
mask[..., feature_idx] = activation_map.to(mask.device)
|
98 |
+
to_add = mask @ sae.decoder.weight.T
|
99 |
+
return (output[0] + to_add.permute(0, 3, 1, 2).to(output[0].device),)
|
100 |
+
|
101 |
+
@torch.no_grad()
|
102 |
+
def add_feature_on_area_flux(
|
103 |
+
sae,
|
104 |
+
feature_idx,
|
105 |
+
activation_map,
|
106 |
+
module,
|
107 |
+
input: torch.Tensor,
|
108 |
+
output: torch.Tensor,
|
109 |
+
):
|
110 |
+
|
111 |
+
diff = (output - input).to(sae.device)
|
112 |
+
activated = sae.encode(diff)
|
113 |
+
|
114 |
+
# TODO: check
|
115 |
+
if len(activation_map) == 2:
|
116 |
+
activation_map = activation_map.unsqueeze(0)
|
117 |
+
mask = torch.zeros_like(activated, device=diff.device)
|
118 |
+
activation_map = activation_map.flatten()
|
119 |
+
mask[..., feature_idx] = activation_map.to(mask.device)
|
120 |
+
to_add = mask @ sae.decoder.weight.T
|
121 |
+
return output + to_add.to(output.device, output.dtype)
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
@torch.no_grad()
|
126 |
+
def replace_with_feature_turbo(sae, feature_idx, value, module, input, output):
|
127 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
128 |
+
activated = sae.encode(diff)
|
129 |
+
mask = torch.zeros_like(activated, device=diff.device)
|
130 |
+
mask[..., feature_idx] = value
|
131 |
+
to_add = mask @ sae.decoder.weight.T
|
132 |
+
return (input[0] + to_add.permute(0, 3, 1, 2).to(output[0].device),)
|
133 |
+
|
134 |
+
|
135 |
+
@torch.no_grad()
|
136 |
+
def reconstruct_sae_hook(sae, module, input, output):
|
137 |
+
diff = (output[0] - input[0]).permute((0, 2, 3, 1)).to(sae.device)
|
138 |
+
activated = sae.encode(diff)
|
139 |
+
reconstructed = sae.decoder(activated) + sae.pre_bias
|
140 |
+
return (input[0] + reconstructed.permute(0, 3, 1, 2).to(output[0].device),)
|
141 |
+
|
142 |
+
|
143 |
+
@torch.no_grad()
|
144 |
+
def ablate_block(module, input, output):
|
145 |
+
return input
|