Spaces: Running on Zero
Commit · f96f6770
Parent(s): test

Browse files
- .gitattributes +36 -0
- README.md +13 -0
- __pycache__/dreamfuse_inference.cpython-310.pyc +0 -0
- app.py +491 -0
- dreamfuse/.DS_Store +0 -0
- dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc +0 -0
- dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc +0 -0
- dreamfuse/models/dreamfuse_flux/flux_processor.py +269 -0
- dreamfuse/models/dreamfuse_flux/transformer.py +866 -0
- dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc +0 -0
- dreamfuse/trains/utils/inference_utils.py +386 -0
- dreamfuse_inference.py +642 -0
- examples/9_01.png +3 -0
- examples/9_02.png +3 -0
- output_images/no_bg_image.png +3 -0
- requirements.txt +37 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
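These rules route matching binary artifacts (model weights, archives, and, in this Space, all PNGs) through Git LFS instead of storing them directly in the repository. As a hedged illustration only (not part of this commit), an additional pattern would normally be registered with the `git lfs track` CLI; the snippet below drives it from Python, mirroring app.py's own use of subprocess. The `*.webp` pattern is a hypothetical example and git-lfs is assumed to be installed.

import subprocess

# `git lfs track` appends a "<pattern> filter=lfs diff=lfs merge=lfs -text" line to .gitattributes.
subprocess.run(["git", "lfs", "track", "*.webp"], check=True)
# Stage the updated .gitattributes so the new rule ships with the next commit.
subprocess.run(["git", "add", ".gitattributes"], check=True)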
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: DreamFuse
emoji: 📚
colorFrom: indigo
colorTo: blue
sdk: gradio
sdk_version: 5.24.0
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/dreamfuse_inference.cpython-310.pyc
ADDED
Binary file (14.1 kB)
app.py
ADDED
@@ -0,0 +1,491 @@
import gradio as gr
import spaces
from PIL import Image, ImageDraw, ImageOps
import base64, json
from io import BytesIO
import torch.nn.functional as F
import json
from typing import List
from dataclasses import dataclass, field
from dreamfuse_inference import DreamFuseInference, InferenceConfig
import numpy as np
import os
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
import torch
import subprocess
subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
generated_images = []


RMBG_model = AutoModelForImageSegmentation.from_pretrained('briaai/RMBG-2.0', trust_remote_code=True)
RMBG_model = RMBG_model.to("cuda")
transform = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

@spaces.GPU
def remove_bg(image):
    # Predict a foreground mask with RMBG-2.0 and resize it back to the input resolution.
    im = image.convert("RGB")
    input_tensor = transform(im).unsqueeze(0).to("cuda")
    with torch.no_grad():
        preds = RMBG_model(input_tensor)[-1].sigmoid().cpu()[0].squeeze()
    mask = transforms.ToPILImage()(preds).resize(im.size)
    return mask

class DreamblendGUI:
    def __init__(self):
        self.examples = [
            ["./examples/9_02.png",
             "./examples/9_01.png"],
        ]
        self.examples = [[Image.open(x) for x in example] for example in self.examples]
        self.css_style = self._get_css_style()
        self.js_script = self._get_js_script()

    def _get_css_style(self):
        return """
        body {
            background: transparent;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            color: #fff;
        }
        .gradio-container {
            max-width: 1200px;
            margin: auto;
            background: transparent;
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0px 2px 8px rgba(255,255,255,0.1);
        }
        h1, h2 {
            text-align: center;
            color: #fff;
        }
        #canvas_preview {
            border: 2px dashed rgba(255,255,255,0.5);
            padding: 10px;
            background: transparent;
            border-radius: 8px;
        }
        .gr-button {
            background-color: #007bff;
            border: none;
            color: #fff;
            padding: 10px 20px;
            border-radius: 5px;
            font-size: 16px;
            cursor: pointer;
        }
        .gr-button:hover {
            background-color: #0056b3;
        }
        #small-examples {
            max-width: 200px !important;
            width: 200px !important;
            float: left;
            margin-right: 20px;
        }
        """

    def _get_js_script(self):
        return r"""
        async () => {
            window.updateTransformation = function() {
                const img = document.getElementById('draggable-img');
                const container = document.getElementById('canvas-container');
                if (!img || !container) return;
                const left = parseFloat(img.style.left) || 0;
                const top = parseFloat(img.style.top) || 0;

                const canvasSize = 400;
                const data_original_width = parseFloat(img.getAttribute('data-original-width'));
                const data_original_height = parseFloat(img.getAttribute('data-original-height'));
                const bgWidth = parseFloat(container.dataset.bgWidth);
                const bgHeight = parseFloat(container.dataset.bgHeight);
                const scale_ratio = img.clientWidth / data_original_width;

                const transformation = {
                    drag_left: left,
                    drag_top: top,
                    drag_width: img.clientWidth,
                    drag_height: img.clientHeight,
                    data_original_width: data_original_width,
                    data_original_height: data_original_height,
                    scale_ratio: scale_ratio
                };

                const transInput = document.querySelector("#transformation_info textarea");
                if(transInput){
                    const newValue = JSON.stringify(transformation);
                    const nativeSetter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
                    nativeSetter.call(transInput, newValue);
                    transInput.dispatchEvent(new Event('input', { bubbles: true }));
                    console.log("Transformation info updated: ", newValue);
                } else {
                    console.log("Could not find the transformation_info textarea element");
                }
            };

            globalThis.initializeDrag = () => {
                console.log("Initializing drag and scale handlers...");
                const observer = new MutationObserver(() => {
                    const img = document.getElementById('draggable-img');
                    const container = document.getElementById('canvas-container');
                    const slider = document.getElementById('scale-slider');
                    if (img && container && slider) {
                        observer.disconnect();
                        console.log("Binding drag and scale events...");
                        img.ondragstart = (e) => { e.preventDefault(); return false; };
                        let offsetX = 0, offsetY = 0;
                        let isDragging = false;
                        let scaleAnchor = null;

                        img.addEventListener('mousedown', (e) => {
                            isDragging = true;
                            img.style.cursor = 'grabbing';
                            const imgRect = img.getBoundingClientRect();
                            offsetX = e.clientX - imgRect.left;
                            offsetY = e.clientY - imgRect.top;
                            img.style.transform = "none";
                            img.style.left = img.offsetLeft + "px";
                            img.style.top = img.offsetTop + "px";
                            console.log("mousedown: left=", img.style.left, "top=", img.style.top);
                        });
                        document.addEventListener('mousemove', (e) => {
                            if (!isDragging) return;
                            e.preventDefault();

                            const containerRect = container.getBoundingClientRect();
                            // Current drag position relative to the container
                            let left = e.clientX - containerRect.left - offsetX;
                            let top = e.clientY - containerRect.top - offsetY;

                            // Allowed horizontal drag range:
                            // at most 7/8 of the image may leave the left edge: min = -img.clientWidth * (7/8)
                            // at most 7/8 may leave the right edge: max = containerRect.width - img.clientWidth * (1/8)
                            const minLeft = -img.clientWidth * (7/8);
                            const maxLeft = containerRect.width - img.clientWidth * (1/8);

                            // Allowed vertical drag range:
                            // min = -img.clientHeight * (7/8)
                            // max = containerRect.height - img.clientHeight * (1/8)
                            const minTop = -img.clientHeight * (7/8);
                            const maxTop = containerRect.height - img.clientHeight * (1/8);

                            // Clamp to the allowed range
                            if (left < minLeft) left = minLeft;
                            if (left > maxLeft) left = maxLeft;

                            if (top < minTop) top = minTop;
                            if (top > maxTop) top = maxTop;

                            img.style.left = left + "px";
                            img.style.top = top + "px";
                        });

                        window.addEventListener('mouseup', (e) => {
                            if (isDragging) {
                                isDragging = false;
                                img.style.cursor = 'grab';
                                const containerRect = container.getBoundingClientRect();
                                const bgWidth = parseFloat(container.dataset.bgWidth);
                                const bgHeight = parseFloat(container.dataset.bgHeight);
                                const offsetLeft = (containerRect.width - bgWidth) / 2;
                                const offsetTop = (containerRect.height - bgHeight) / 2;
                                const absoluteLeft = parseFloat(img.style.left);
                                const absoluteTop = parseFloat(img.style.top);
                                const relativeX = absoluteLeft - offsetLeft;
                                const relativeY = absoluteTop - offsetTop;
                                document.getElementById("coordinate").textContent =
                                    `Foreground position: (x=${relativeX.toFixed(2)}, y=${relativeY.toFixed(2)})`;
                                updateTransformation();
                            }
                            scaleAnchor = null;
                        });

                        slider.addEventListener('mousedown', (e) => {
                            const containerRect = container.getBoundingClientRect();
                            const imgRect = img.getBoundingClientRect();
                            scaleAnchor = {
                                x: imgRect.left + imgRect.width/2 - containerRect.left,
                                y: imgRect.top + imgRect.height/2 - containerRect.top
                            };
                            console.log("Slider mousedown, captured scaleAnchor: ", scaleAnchor);
                        });

                        slider.addEventListener('input', (e) => {
                            const scale = parseFloat(e.target.value);
                            const originalWidth = parseFloat(img.getAttribute('data-original-width'));
                            const originalHeight = parseFloat(img.getAttribute('data-original-height'));
                            const newWidth = originalWidth * scale;
                            const newHeight = originalHeight * scale;
                            const containerRect = container.getBoundingClientRect();
                            let centerX, centerY;
                            if (scaleAnchor) {
                                centerX = scaleAnchor.x;
                                centerY = scaleAnchor.y;
                            } else {
                                const imgRect = img.getBoundingClientRect();
                                centerX = imgRect.left + imgRect.width/2 - containerRect.left;
                                centerY = imgRect.top + imgRect.height/2 - containerRect.top;
                            }
                            const newLeft = centerX - newWidth/2;
                            const newTop = centerY - newHeight/2;
                            img.style.width = newWidth + "px";
                            img.style.height = newHeight + "px";
                            img.style.left = newLeft + "px";
                            img.style.top = newTop + "px";
                            console.log("slider: scale=", scale, "newWidth=", newWidth, "newHeight=", newHeight);
                            updateTransformation();
                        });

                        slider.addEventListener('mouseup', (e) => {
                            scaleAnchor = null;
                        });
                    }
                });
                observer.observe(document.body, { childList: true, subtree: true });
            };
        }
        """


    def get_next_sequence(self, folder_path):
        # List all filenames in the folder
        filenames = os.listdir(folder_path)
        # Extract the leading sequence number from each filename (assumed to be the digits before "_")
        sequences = [int(name.split('_')[0]) for name in filenames if name.split('_')[0].isdigit()]
        # Find the largest sequence number seen so far
        max_sequence = max(sequences, default=-1)
        # Return the next sequence number, formatted as three digits (e.g. 002)
        return f"{max_sequence + 1:03d}"


    def pil_to_base64(self, img):
        """Convert a PIL Image to a base64 string, keeping the alpha channel via PNG."""
        if img is None:
            return ""
        if img.mode != "RGBA":
            img = img.convert("RGBA")
        buffered = BytesIO()
        img.save(buffered, format="PNG", optimize=True)
        img_bytes = buffered.getvalue()
        base64_str = base64.b64encode(img_bytes).decode()
        return f"data:image/png;base64,{base64_str}"

    def resize_background_image(self, img, max_size=400):
        """Proportionally resize the background image so its longest side is max_size (400)."""
        if img is None:
            return None
        w, h = img.size
        if w > max_size or h > max_size:
            ratio = min(max_size / w, max_size / h)
            new_w, new_h = int(w * ratio), int(h * ratio)
            img = img.resize((new_w, new_h), Image.LANCZOS)
        return img

    def resize_draggable_image(self, img, max_size=400):
        """Proportionally resize the foreground image so its longest side does not exceed max_size (400)."""
        if img is None:
            return None
        w, h = img.size
        if w > max_size or h > max_size:
            ratio = min(max_size / w, max_size / h)
            new_w, new_h = int(w * ratio), int(h * ratio)
            img = img.resize((new_w, new_h), Image.LANCZOS)
        return img

    def generate_html(self, background_img_b64, bg_width, bg_height, draggable_img_b64, draggable_width, draggable_height, canvas_size=400):
        """Generate the preview HTML page."""
        html_code = f"""
        <html>
        <head>
        <style>
        body {{
            margin: 0;
            padding: 0;
            text-align: center;
            font-family: sans-serif;
            background: transparent;
            color: #fff;
        }}
        h2 {{
            margin-top: 1rem;
        }}
        #scale-control {{
            margin: 1rem auto;
            width: 400px;
            text-align: left;
        }}
        #scale-control label {{
            font-size: 1rem;
            margin-right: 0.5rem;
        }}
        #canvas-container {{
            position: relative;
            width: {canvas_size}px;
            height: {canvas_size}px;
            margin: 0 auto;
            border: 1px dashed rgba(255,255,255,0.5);
            overflow: hidden;
            background-image: url('{background_img_b64}');
            background-repeat: no-repeat;
            background-position: center;
            background-size: contain;
            border-radius: 8px;
        }}
        #draggable-img {{
            position: absolute;
            cursor: grab;
            left: 50%;
            top: 50%;
            transform: translate(-50%, -50%);
            background-color: transparent;
        }}
        #coordinate {{
            color: #fff;
            margin-top: 1rem;
            font-weight: bold;
        }}
        </style>
        </head>
        <body>
        <h2>Drag the foreground image (scaling supported)</h2>
        <div id="scale-control">
            <label for="scale-slider">Foreground scale:</label>
            <input type="range" id="scale-slider" min="0.1" max="2" step="0.01" value="1">
        </div>
        <div id="canvas-container" data-bg-width="{bg_width}" data-bg-height="{bg_height}">
            <img id="draggable-img"
                 src="{draggable_img_b64}"
                 alt="Draggable Image"
                 draggable="false"
                 data-original-width="{draggable_width}"
                 data-original-height="{draggable_height}"
            />
        </div>
        <p id="coordinate">Foreground position: (x=?, y=?)</p>
        </body>
        </html>
        """
        return html_code

    def on_upload(self, background_img, draggable_img):
        """Handle the uploaded images."""
        if background_img is None or draggable_img is None:
            return "<p style='color:red;'>Please upload a background image and a draggable foreground image first.</p>"

        if draggable_img.mode != "RGB":
            draggable_img = draggable_img.convert("RGB")
        draggable_img_mask = remove_bg(draggable_img)
        alpha_channel = draggable_img_mask.convert("L")
        draggable_img = draggable_img.convert("RGBA")
        draggable_img.putalpha(alpha_channel)

        resized_bg = self.resize_background_image(background_img, max_size=400)
        bg_w, bg_h = resized_bg.size

        resized_fg = self.resize_draggable_image(draggable_img, max_size=400)
        draggable_width, draggable_height = resized_fg.size

        background_img_b64 = self.pil_to_base64(resized_bg)
        draggable_img_b64 = self.pil_to_base64(resized_fg)

        return self.generate_html(
            background_img_b64, bg_w, bg_h,
            draggable_img_b64, draggable_width, draggable_height,
            canvas_size=400
        ), draggable_img

    def save_image(self, save_path="/mnt/bn/hjj-humanseg-lq/SubjectDriven/DreamFuse/debug"):
        global generated_images
        save_name = self.get_next_sequence(save_path)
        generated_images[0].save(os.path.join(save_path, f"{save_name}_0_ori.png"))
        generated_images[1].save(os.path.join(save_path, f"{save_name}_0.png"))
        generated_images[2].save(os.path.join(save_path, f"{save_name}_1.png"))
        generated_images[3].save(os.path.join(save_path, f"{save_name}_2.png"))
        generated_images[4].save(os.path.join(save_path, f"{save_name}_0_mask.png"))
        generated_images[5].save(os.path.join(save_path, f"{save_name}_0_mask_scale.png"))
        generated_images[6].save(os.path.join(save_path, f"{save_name}_0_scale.png"))
        generated_images[7].save(os.path.join(save_path, f"{save_name}_2_pasted.png"))


    def create_gui(self):
        config = InferenceConfig()
        config.lora_id = 'LL3RD/DreamFuse'

        pipeline = DreamFuseInference(config)
        # Allocate a ZeroGPU slot for up to 120 s per generation call.
        pipeline.gradio_generate = spaces.GPU(duration=120)(pipeline.gradio_generate)
        """Create the Gradio interface."""
        with gr.Blocks(css=self.css_style) as demo:
            modified_fg_state = gr.State()
            gr.Markdown("# Dreamblend-GUI-dirtydata")
            gr.Markdown("Upload a background and a foreground image to get a draggable/scalable preview and a blended result; seed settings and prompt text input are also supported.")
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Upload Images")
                    background_img_in = gr.Image(label="Background Image", type="pil", height=240, width=240)
                    draggable_img_in = gr.Image(label="Foreground Image", type="pil", image_mode="RGBA", height=240, width=240)
                    generate_btn = gr.Button("Generate Draggable Canvas")

                    with gr.Row():
                        gr.Examples(
                            examples=[self.examples[0]],
                            inputs=[background_img_in, draggable_img_in],
                            elem_id="small-examples"
                        )
                with gr.Column(scale=1):
                    gr.Markdown("### Preview Area")
                    html_out = gr.HTML(label="Preview & Drag", elem_id="canvas_preview")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Parameters")
                    seed_slider = gr.Slider(minimum=0, maximum=10000, step=1, label="Seed", value=42)
                    cfg_slider = gr.Slider(minimum=1, maximum=10, step=0.1, label="CFG", value=3.5)
                    size_select = gr.Radio(
                        choices=["512", "768", "1024"],
                        value="512",
                        label="Generation quality (512 = low, 1024 = high)",
                    )
                    prompt_text = gr.Textbox(label="Prompt", placeholder="Enter a text prompt", value="")
                    text_strength = gr.Slider(minimum=1, maximum=10, step=1, label="Text Strength", value=1)
                    enable_gui = gr.Checkbox(label="Enable GUI", value=True)
                    enable_truecfg = gr.Checkbox(label="Enable TrueCFG", value=False)
                    enable_save = gr.Button("Save Images (internal testing)", visible=True)
                with gr.Column(scale=1):
                    gr.Markdown("### Model Output")
                    model_generate_btn = gr.Button("Run Model")
                    transformation_text = gr.Textbox(label="Transformation Info", elem_id="transformation_info", visible=False)
                    model_output = gr.Image(label="Model output", type="pil")

            # Bind interaction events
            enable_save.click(fn=self.save_image, inputs=None, outputs=None)
            generate_btn.click(
                fn=self.on_upload,
                inputs=[background_img_in, draggable_img_in],
                outputs=[html_out, modified_fg_state],
            )
            model_generate_btn.click(
                fn=pipeline.gradio_generate,
                inputs=[background_img_in, modified_fg_state, transformation_text, seed_slider, \
                        prompt_text, enable_gui, cfg_slider, size_select, text_strength, enable_truecfg],
                outputs=model_output
            )
            # Initialize the drag/scale handlers after the page loads
            demo.load(None, None, None, js=self.js_script)
            generate_btn.click(fn=None, inputs=None, outputs=None, js="initializeDrag")

        return demo

if __name__ == "__main__":

    gui = DreamblendGUI()
    demo = gui.create_gui()
    demo.queue()
    demo.launch()
    # demo.launch(server_port=7789, ssr_mode=False)
    # demo.launch(server_name="[::]", share=True)
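For reference, the hidden `transformation_info` textbox above receives a JSON object from the JavaScript (`drag_left`, `drag_top`, `drag_width`, `drag_height`, `data_original_width`, `data_original_height`, `scale_ratio`) describing where the resized foreground sits on the 400 px preview canvas. The mapping back to background-image coordinates happens inside `DreamFuseInference.gradio_generate`, which is not part of this diff, so the sketch below is only an assumption of how such a mapping could look; the helper name `paste_box_on_background` is hypothetical.

import json

def paste_box_on_background(transformation_json, bg_size, canvas_size=400):
    """Return (left, top, width, height) of the dragged foreground in background pixel coordinates."""
    t = json.loads(transformation_json)
    bg_w, bg_h = bg_size
    # The preview fits the background inside the 400 px canvas and centers it.
    preview_ratio = min(canvas_size / bg_w, canvas_size / bg_h)
    preview_w, preview_h = bg_w * preview_ratio, bg_h * preview_ratio
    offset_left = (canvas_size - preview_w) / 2
    offset_top = (canvas_size - preview_h) / 2
    # Convert the dragged box from canvas coordinates back to background coordinates.
    left = (t["drag_left"] - offset_left) / preview_ratio
    top = (t["drag_top"] - offset_top) / preview_ratio
    width = t["drag_width"] / preview_ratio
    height = t["drag_height"] / preview_ratio
    return left, top, width, height

# Example with a 1024x768 background and a foreground dragged near the canvas center:
print(paste_box_on_background(
    '{"drag_left": 150, "drag_top": 100, "drag_width": 100, "drag_height": 100,'
    ' "data_original_width": 200, "data_original_height": 200, "scale_ratio": 0.5}',
    (1024, 768),
))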
dreamfuse/.DS_Store
ADDED
Binary file (6.15 kB)
dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc
ADDED
Binary file (7.61 kB)
dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (23.9 kB)
dreamfuse/models/dreamfuse_flux/flux_processor.py
ADDED
@@ -0,0 +1,269 @@
import inspect
import math
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import deprecate, logging
from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_torch_version, maybe_allow_in_graph
from diffusers.models.attention import Attention
from diffusers.models.embeddings import Timesteps, TimestepEmbedding, PixArtAlphaTextProjection

class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
    def __init__(self, embedding_dim, pooled_projection_dim):
        super().__init__()

        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")

    def forward(self, timestep, guidance, pooled_projection):
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)

        if (guidance >= 0).all():
            guidance_proj = self.time_proj(guidance)
            guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))  # (N, D)

            time_guidance_emb = timesteps_emb + guidance_emb

            pooled_projections = self.text_embedder(pooled_projection)
            conditioning = time_guidance_emb + pooled_projections
        else:
            pooled_projections = self.text_embedder(pooled_projection)
            conditioning = timesteps_emb + pooled_projections

        return conditioning


def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        if cos.ndim == 2:
            cos = cos[None, None]
        else:
            cos = cos.unsqueeze(1)
        if sin.ndim == 2:
            sin = sin[None, None]
        else:
            sin = sin.unsqueeze(1)
        cos, sin = cos.to(x.device), sin.to(x.device)

        if use_real_unbind_dim == -1:
            # Used for flux, cogvideox, hunyuan-dit
            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        elif use_real_unbind_dim == -2:
            # Used for Stable Audio
            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
        else:
            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out
    else:
        # used for lumina
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

        return x_out.type_as(x)

class FluxAttnSharedProcessor2_0:
    """Attention processor used typically in processing the SD3-like self-attention projections."""

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        data_num_per_group: Optional[int] = 1,
        max_sequence_length: Optional[int] = 512,
        mix_attention: bool = True,
        cond_latents=None,
        cond_image_rotary_emb=None,
        work_mode=None,
        mask_cond=None,
    ) -> torch.FloatTensor:
        with_cond = cond_latents is not None and mix_attention

        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

        # `sample` projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)


        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
        if encoder_hidden_states is not None:
            # `context` projections.
            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)
            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)
            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)

            if attn.norm_added_q is not None:
                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
            if attn.norm_added_k is not None:
                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)

            # attention
            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)

        if with_cond:
            cond_bs = cond_latents.shape[0]

            # update condition
            cond_query = attn.to_q(cond_latents)
            cond_query = cond_query.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            if attn.norm_q is not None:
                cond_query = attn.norm_q(cond_query)
            cond_query = apply_rotary_emb(cond_query, cond_image_rotary_emb)
            cond_query = torch.cat(cond_query.chunk(len(cond_query), dim=0), dim=2)

            cond_key = attn.to_k(cond_latents)
            cond_value = attn.to_v(cond_latents)
            cond_key = cond_key.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            cond_value = cond_value.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            if attn.norm_k is not None:
                cond_key = attn.norm_k(cond_key)

            cond_key = apply_rotary_emb(cond_key, cond_image_rotary_emb)

            cond_key = torch.cat(cond_key.chunk(len(cond_key), dim=0), dim=2)
            cond_value = torch.cat(cond_value.chunk(len(cond_value), dim=0), dim=2)

        if data_num_per_group > 1 and mix_attention:
            E = max_sequence_length  # according to text len

            key_enc, key_hid = key[:, :, :E], key[:, :, E:]
            value_enc, value_hid = value[:, :, :E], value[:, :, E:]

            key_layer = key_hid.chunk(data_num_per_group, dim=0)
            key_layer = torch.cat(key_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)

            value_layer = value_hid.chunk(data_num_per_group, dim=0)
            value_layer = torch.cat(value_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)

            key = torch.cat([key_enc, key_layer], dim=2)
            value = torch.cat([value_enc, value_layer], dim=2)

        elif data_num_per_group == 1 and mix_attention and with_cond:
            E = max_sequence_length  # according to text len

            key_enc, key_hid = key[:, :, :E], key[:, :, E:]
            value_enc, value_hid = value[:, :, :E], value[:, :, E:]

            # todo: support bs != 1
            key_layer = torch.cat([key_hid, cond_key], dim=2)
            value_layer = torch.cat([value_hid, cond_value], dim=2)

            key = torch.cat([key_enc, key_layer], dim=2)
            value = torch.cat([value_enc, value_layer], dim=2)

            # concat query
            query_enc, query_hid = query[:, :, :E], query[:, :, E:]
            query_layer = torch.cat([query_hid, cond_query], dim=2)
            query = torch.cat([query_enc, query_layer], dim=2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        if encoder_hidden_states is not None:
            if with_cond:
                encoder_hidden_states, hidden_states, cond_latents = (
                    hidden_states[:, : encoder_hidden_states.shape[1]],
                    hidden_states[:, encoder_hidden_states.shape[1] : -cond_latents.shape[1]*cond_bs],
                    hidden_states[:, -cond_latents.shape[1]*cond_bs :],
                )
                cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
                cond_latents = attn.to_out[0](cond_latents)
                cond_latents = attn.to_out[1](cond_latents)
            else:
                encoder_hidden_states, hidden_states = (
                    hidden_states[:, : encoder_hidden_states.shape[1]],
                    hidden_states[:, encoder_hidden_states.shape[1]:],
                )

            # linear proj
            hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)

            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

            if with_cond:
                return hidden_states, encoder_hidden_states, cond_latents
            return hidden_states, encoder_hidden_states
        else:
            if with_cond:
                hidden_states, cond_latents = (
                    hidden_states[:, : -cond_latents.shape[1]*cond_bs],
                    hidden_states[:, -cond_latents.shape[1]*cond_bs :],
                )
                cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
                return hidden_states, cond_latents
            return hidden_states
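As a quick sanity check of the rotary-embedding helper above, the following toy-shaped sketch (shapes chosen arbitrarily, not taken from this diff) exercises `apply_rotary_emb` along the default `use_real=True` path, where a 2-D `(cos, sin)` table is broadcast over a `[B, H, S, D]` query or key tensor and the output keeps the input shape:

import torch
from dreamfuse.models.dreamfuse_flux.flux_processor import apply_rotary_emb

B, H, S, D = 1, 24, 16, 128                      # batch, heads, tokens, head dim (D must be even)
x = torch.randn(B, H, S, D)                      # a query or key tensor
cos, sin = torch.randn(S, D), torch.randn(S, D)  # precomputed frequency tables, [S, D] each

out = apply_rotary_emb(x, (cos, sin))            # rotates channel pairs per token position
assert out.shape == (B, H, S, D) and out.dtype == x.dtype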
dreamfuse/models/dreamfuse_flux/transformer.py
ADDED
@@ -0,0 +1,866 @@
# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
from diffusers.models.attention import FeedForward
from diffusers.models.attention_processor import (
    Attention,
    AttentionProcessor,
    FluxAttnProcessor2_0,
    FluxAttnProcessor2_0_NPU,
    FusedFluxAttnProcessor2_0,
)
from dreamfuse.models.dreamfuse_flux.flux_processor import FluxAttnSharedProcessor2_0
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
from diffusers.utils.import_utils import is_torch_npu_available
from diffusers.utils.torch_utils import maybe_allow_in_graph
from diffusers.models.embeddings import CombinedTimestepTextProjEmbeddings, FluxPosEmbed
from diffusers.models.modeling_outputs import Transformer2DModelOutput

from .flux_processor import CombinedTimestepGuidanceTextProjEmbeddings

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

def zero_module(module):
    for p in module.parameters():
        nn.init.zeros_(p)
    return module

class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class CrossAttention(nn.Module):
    def __init__(self, query_dim: int, cross_attention_dim: int, heads: int = 8, dim_head: int = 64, dropout: float = 0.0, bias: bool = False):
        super().__init__()
        self.heads = heads
        self.dim_head = cross_attention_dim // heads
        self.attn_to_q = nn.Linear(query_dim, cross_attention_dim, bias=bias)
        self.norm_q = nn.LayerNorm(self.dim_head)

        self.attn_to_k = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)
        self.norm_k = nn.LayerNorm(self.dim_head)

        self.attn_to_v = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)

        self.attn_to_out = nn.ModuleList([])
        self.attn_to_out.append(nn.Linear(query_dim, query_dim, bias=bias))
        self.attn_to_out.append(nn.Dropout(dropout))

        # zero init
        with torch.no_grad():
            self.attn_to_out[0].weight.fill_(0)
            # self.to_out[0].bias.fill_(0)

    def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
        batch_size, sequence_length, _ = hidden_states.shape

        query = self.attn_to_q(hidden_states)
        key = self.attn_to_k(encoder_hidden_states)
        value = self.attn_to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // self.heads

        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)

        query = self.norm_q(query)
        key = self.norm_k(key)

        hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,)
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)

        hidden_states = self.attn_to_out[0](hidden_states)
        hidden_states = self.attn_to_out[1](hidden_states)

        return hidden_states

@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

        self.norm = AdaLayerNormZeroSingle(dim)
        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)

        processor = FluxAttnSharedProcessor2_0()

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            processor=processor,
            qk_norm="rms_norm",
            eps=1e-6,
            pre_only=True,
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        data_num_per_group=1,
        max_sequence_length=512,
        mix_attention: bool = True,
        cond_temb=None,
        cond_image_rotary_emb=None,
        cond_latents=None,
        joint_attention_kwargs=None,

    ):
        with_cond = cond_latents is not None and mix_attention

        residual = hidden_states
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))

        if with_cond:
            residual_cond = cond_latents
            norm_cond_latents, cond_gate = self.norm(cond_latents, emb=cond_temb)
            mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_latents))

        joint_attention_kwargs = joint_attention_kwargs or {}
        attn_output = self.attn(
            hidden_states=norm_hidden_states,
            image_rotary_emb=image_rotary_emb,
            data_num_per_group=data_num_per_group,
            max_sequence_length=max_sequence_length,
            mix_attention=mix_attention,
            cond_latents=norm_cond_latents if with_cond else None,
            cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
            **joint_attention_kwargs,
        )

        if with_cond:
            attn_output, cond_attn_output = attn_output

        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        gate = gate.unsqueeze(1)
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = residual + hidden_states

        if with_cond:
            cond_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
            cond_gate = cond_gate.unsqueeze(1)
            cond_latents = cond_gate * self.proj_out(cond_latents)
            cond_latents = residual_cond + cond_latents

        if hidden_states.dtype == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)

        if with_cond:
            return hidden_states, cond_latents
        else:
            return hidden_states


@maybe_allow_in_graph
class FluxTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)

        self.norm1_context = AdaLayerNormZero(dim)

        processor = FluxAttnSharedProcessor2_0()

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        data_num_per_group=1,
        max_sequence_length=512,
        mix_attention: bool = True,
        cond_temb=None,
        cond_image_rotary_emb=None,
        cond_latents=None,
        joint_attention_kwargs=None,
    ):
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
            encoder_hidden_states, emb=temb
        )
        joint_attention_kwargs = joint_attention_kwargs or {}

        with_cond = cond_latents is not None and mix_attention
        if with_cond:
            norm_cond_latents, cond_gate_msa, cond_shift_mlp, cond_scale_mlp, cond_gate_mlp = self.norm1(cond_latents, emb=cond_temb)

        # Attention.
        attention_outputs = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            image_rotary_emb=image_rotary_emb,
            data_num_per_group=data_num_per_group,
            max_sequence_length=max_sequence_length,
            mix_attention=mix_attention,
            cond_latents=norm_cond_latents if with_cond else None,
            cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
            **joint_attention_kwargs,
        )

        if len(attention_outputs) == 2:
            attn_output, context_attn_output = attention_outputs
        elif len(attention_outputs) == 3 and with_cond:
            attn_output, context_attn_output, cond_attn_output = attention_outputs
        elif len(attention_outputs) == 3:
            attn_output, context_attn_output, ip_attn_output = attention_outputs

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output
        if len(attention_outputs) == 3 and not with_cond:
            hidden_states = hidden_states + ip_attn_output

        if with_cond:
            cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
            cond_latents = cond_latents + cond_attn_output

            norm_cond_latents = self.norm2(cond_latents)
            norm_cond_latents = norm_cond_latents * (1 + cond_scale_mlp[:, None]) + cond_shift_mlp[:, None]

            cond_ff_output = self.ff(norm_cond_latents)
            cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output

            cond_latents = cond_latents + cond_ff_output
        # Process attention outputs for the `encoder_hidden_states`.

        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
        encoder_hidden_states = encoder_hidden_states + context_attn_output

        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]

        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        if with_cond:
            return encoder_hidden_states, hidden_states, cond_latents
        else:
            return encoder_hidden_states, hidden_states


class FluxTransformer2DModel(
    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
):
    """
    The Transformer model introduced in Flux.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Parameters:
        patch_size (`int`): Patch size to turn the input data into small patches.
        in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
        num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
        num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
        num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
        guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]

    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 64,
        out_channels: Optional[int] = None,
        num_layers: int = 19,
        num_single_layers: int = 38,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 4096,
        pooled_projection_dim: int = 768,
        guidance_embeds: bool = False,
        axes_dims_rope: Tuple[int] = (16, 56, 56),
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
        if getattr(self.config, "num_image_tag_embeddings", None) is not None:
            self.image_tag_embeddings = nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim)
        if getattr(self.config, "num_context_tag_embeddings", None) is not None:
            self.context_tag_embeddings = nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim)

        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)

        text_time_guidance_cls = (
            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
        )
        self.time_text_embed = text_time_guidance_cls(
            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
|
402 |
+
)
|
403 |
+
|
404 |
+
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
405 |
+
self.x_embedder = nn.Linear(self.config.in_channels, self.inner_dim)
|
406 |
+
|
407 |
+
self.transformer_blocks = nn.ModuleList(
|
408 |
+
[
|
409 |
+
FluxTransformerBlock(
|
410 |
+
dim=self.inner_dim,
|
411 |
+
num_attention_heads=self.config.num_attention_heads,
|
412 |
+
attention_head_dim=self.config.attention_head_dim,
|
413 |
+
)
|
414 |
+
for i in range(self.config.num_layers)
|
415 |
+
]
|
416 |
+
)
|
417 |
+
|
418 |
+
self.single_transformer_blocks = nn.ModuleList(
|
419 |
+
[
|
420 |
+
FluxSingleTransformerBlock(
|
421 |
+
dim=self.inner_dim,
|
422 |
+
num_attention_heads=self.config.num_attention_heads,
|
423 |
+
attention_head_dim=self.config.attention_head_dim,
|
424 |
+
)
|
425 |
+
for i in range(self.config.num_single_layers)
|
426 |
+
]
|
427 |
+
)
|
428 |
+
|
429 |
+
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
|
430 |
+
self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
|
431 |
+
|
432 |
+
self.gradient_checkpointing = False
|
433 |
+
|
434 |
+
def set_tag_embeddings(self, num_image_tag_embeddings=0, num_context_tag_embeddings=0):
|
435 |
+
if num_image_tag_embeddings > 0:
|
436 |
+
self.config.num_image_tag_embeddings = num_image_tag_embeddings
|
437 |
+
self.image_tag_embeddings = zero_module(nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim))
|
438 |
+
if num_context_tag_embeddings > 0:
|
439 |
+
self.config.num_context_tag_embeddings = num_context_tag_embeddings
|
440 |
+
self.context_tag_embeddings = zero_module(nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim))
|
441 |
+
|
442 |
+
def set_mask_tokenizer(self, mask_in_chans, mask_out_chans, activation = nn.GELU):
|
443 |
+
self.mask_tokenizer = nn.Sequential(
|
444 |
+
nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
|
445 |
+
LayerNorm2d(mask_in_chans // 4),
|
446 |
+
activation(),
|
447 |
+
nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=3, padding=1),
|
448 |
+
LayerNorm2d(mask_in_chans),
|
449 |
+
activation(),
|
450 |
+
nn.Conv2d(mask_in_chans, mask_out_chans, kernel_size=1),
|
451 |
+
nn.AdaptiveAvgPool2d((16, 16))
|
452 |
+
)
|
453 |
+
|
454 |
+
self.mask_attn = CrossAttention(mask_out_chans, mask_out_chans)
|
455 |
+
|
456 |
+
def forward_mask_attn(self, mask_images, fg_images):
|
457 |
+
mask_images = self.mask_tokenizer(mask_images)
|
458 |
+
mask_images = mask_images.flatten(2).transpose(1, 2)
|
459 |
+
mask_images = self.mask_attn(mask_images, fg_images, attention_mask=None)
|
460 |
+
return mask_images
|
461 |
+
|
462 |
+
@property
|
463 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
|
464 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
465 |
+
r"""
|
466 |
+
Returns:
|
467 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model,
|
468 |
+
indexed by its weight name.
|
469 |
+
"""
|
470 |
+
# set recursively
|
471 |
+
processors = {}
|
472 |
+
|
473 |
+
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
474 |
+
if hasattr(module, "get_processor"):
|
475 |
+
processors[f"{name}.processor"] = module.get_processor()
|
476 |
+
|
477 |
+
for sub_name, child in module.named_children():
|
478 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
479 |
+
|
480 |
+
return processors
|
481 |
+
|
482 |
+
for name, module in self.named_children():
|
483 |
+
fn_recursive_add_processors(name, module, processors)
|
484 |
+
|
485 |
+
return processors
|
486 |
+
|
487 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
488 |
+
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
489 |
+
r"""
|
490 |
+
Sets the attention processor to use to compute attention.
|
491 |
+
|
492 |
+
Parameters:
|
493 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
494 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
495 |
+
for **all** `Attention` layers.
|
496 |
+
|
497 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
498 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
499 |
+
|
500 |
+
"""
|
501 |
+
count = len(self.attn_processors.keys())
|
502 |
+
|
503 |
+
if isinstance(processor, dict) and len(processor) != count:
|
504 |
+
raise ValueError(
|
505 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
506 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
507 |
+
)
|
508 |
+
|
509 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
510 |
+
if hasattr(module, "set_processor"):
|
511 |
+
if not isinstance(processor, dict):
|
512 |
+
module.set_processor(processor)
|
513 |
+
else:
|
514 |
+
module.set_processor(processor.pop(f"{name}.processor"))
|
515 |
+
|
516 |
+
for sub_name, child in module.named_children():
|
517 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
518 |
+
|
519 |
+
for name, module in self.named_children():
|
520 |
+
fn_recursive_attn_processor(name, module, processor)
|
521 |
+
|
522 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
|
523 |
+
def fuse_qkv_projections(self):
|
524 |
+
"""
|
525 |
+
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
526 |
+
are fused. For cross-attention modules, key and value projection matrices are fused.
|
527 |
+
|
528 |
+
<Tip warning={true}>
|
529 |
+
|
530 |
+
This API is 🧪 experimental.
|
531 |
+
|
532 |
+
</Tip>
|
533 |
+
"""
|
534 |
+
self.original_attn_processors = None
|
535 |
+
|
536 |
+
for _, attn_processor in self.attn_processors.items():
|
537 |
+
if "Added" in str(attn_processor.__class__.__name__):
|
538 |
+
raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
|
539 |
+
|
540 |
+
self.original_attn_processors = self.attn_processors
|
541 |
+
|
542 |
+
for module in self.modules():
|
543 |
+
if isinstance(module, Attention):
|
544 |
+
module.fuse_projections(fuse=True)
|
545 |
+
|
546 |
+
self.set_attn_processor(FusedFluxAttnProcessor2_0())
|
547 |
+
|
548 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
|
549 |
+
def unfuse_qkv_projections(self):
|
550 |
+
"""Disables the fused QKV projection if enabled.
|
551 |
+
|
552 |
+
<Tip warning={true}>
|
553 |
+
|
554 |
+
This API is 🧪 experimental.
|
555 |
+
|
556 |
+
</Tip>
|
557 |
+
|
558 |
+
"""
|
559 |
+
if self.original_attn_processors is not None:
|
560 |
+
self.set_attn_processor(self.original_attn_processors)
|
561 |
+
|
562 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
563 |
+
if hasattr(module, "gradient_checkpointing"):
|
564 |
+
module.gradient_checkpointing = value
|
565 |
+
|
566 |
+
def _format_input(self):
|
567 |
+
pass
|
568 |
+
|
569 |
+
def _format_output(self):
|
570 |
+
pass
|
571 |
+
|
572 |
+
def forward(
|
573 |
+
self,
|
574 |
+
hidden_states: torch.Tensor,
|
575 |
+
encoder_hidden_states: torch.Tensor = None,
|
576 |
+
cond_input: dict = None,
|
577 |
+
pooled_projections: torch.Tensor = None,
|
578 |
+
timestep: torch.LongTensor = None,
|
579 |
+
img_ids: torch.Tensor = None,
|
580 |
+
txt_ids: torch.Tensor = None,
|
581 |
+
guidance: torch.Tensor = None,
|
582 |
+
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
583 |
+
controlnet_block_samples=None,
|
584 |
+
controlnet_single_block_samples=None,
|
585 |
+
return_dict: bool = True,
|
586 |
+
controlnet_blocks_repeat: bool = False,
|
587 |
+
data_num_per_group: int = 1,
|
588 |
+
image_tags=None,
|
589 |
+
context_tags=None,
|
590 |
+
max_sequence_length: int = 512,
|
591 |
+
mix_attention_double=True,
|
592 |
+
mix_attention_single=True,
|
593 |
+
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
|
594 |
+
"""
|
595 |
+
The [`FluxTransformer2DModel`] forward method.
|
596 |
+
|
597 |
+
Args:
|
598 |
+
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
599 |
+
Input `hidden_states`.
|
600 |
+
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
601 |
+
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
602 |
+
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
603 |
+
from the embeddings of input conditions.
|
604 |
+
timestep ( `torch.LongTensor`):
|
605 |
+
Used to indicate denoising step.
|
606 |
+
block_controlnet_hidden_states: (`list` of `torch.Tensor`):
|
607 |
+
A list of tensors that if specified are added to the residuals of transformer blocks.
|
608 |
+
joint_attention_kwargs (`dict`, *optional*):
|
609 |
+
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
610 |
+
`self.processor` in
|
611 |
+
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
612 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
613 |
+
Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
|
614 |
+
tuple.
|
615 |
+
|
616 |
+
Returns:
|
617 |
+
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
618 |
+
`tuple` where the first element is the sample tensor.
|
619 |
+
"""
|
620 |
+
if joint_attention_kwargs is not None:
|
621 |
+
joint_attention_kwargs = joint_attention_kwargs.copy()
|
622 |
+
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
623 |
+
else:
|
624 |
+
lora_scale = 1.0
|
625 |
+
|
626 |
+
if USE_PEFT_BACKEND:
|
627 |
+
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
628 |
+
scale_lora_layers(self, lora_scale)
|
629 |
+
else:
|
630 |
+
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
631 |
+
logger.warning(
|
632 |
+
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
633 |
+
)
|
634 |
+
|
635 |
+
hidden_states = self.x_embedder(hidden_states)
|
636 |
+
|
637 |
+
mask_cond = None
|
638 |
+
mask_ids = None
|
639 |
+
if cond_input is not None:
|
640 |
+
cond_image_latents = cond_input["image_latents"]
|
641 |
+
cond_image_ids = cond_input["image_ids"]
|
642 |
+
cond_latents = self.x_embedder(cond_image_latents)
|
643 |
+
|
644 |
+
if joint_attention_kwargs is not None and "mask_cond" in joint_attention_kwargs:
|
645 |
+
mask_cond = joint_attention_kwargs.pop("mask_cond")
|
646 |
+
mask_ids = joint_attention_kwargs.pop("mask_ids")
|
647 |
+
if mask_cond is not None:
|
648 |
+
mask_cond = self.forward_mask_attn(mask_cond, cond_latents[:1])
|
649 |
+
# joint_attention_kwargs["mask_cond"] = mask_cond
|
650 |
+
# hidden_states = hidden_states + mask_cond
|
651 |
+
|
652 |
+
if image_tags is not None:
|
653 |
+
image_tag_embeddings = self.image_tag_embeddings(
|
654 |
+
torch.Tensor(
|
655 |
+
image_tags,
|
656 |
+
).to(device=hidden_states.device, dtype=torch.int64)
|
657 |
+
)
|
658 |
+
bsz = hidden_states.shape[0] // data_num_per_group
|
659 |
+
image_tag_embeddings = image_tag_embeddings.repeat_interleave(bsz, dim=0)
|
660 |
+
if cond_input is not None:
|
661 |
+
hidden_states = hidden_states + image_tag_embeddings[0]
|
662 |
+
cond_latents = cond_latents + image_tag_embeddings[1:].unsqueeze(1)
|
663 |
+
else:
|
664 |
+
# for debug
|
665 |
+
if len(hidden_states) != len(image_tag_embeddings):
|
666 |
+
hidden_states += image_tag_embeddings[:1].unsqueeze(1)
|
667 |
+
else:
|
668 |
+
hidden_states = hidden_states + image_tag_embeddings.unsqueeze(1)
|
669 |
+
|
670 |
+
timestep = timestep.to(hidden_states.dtype) * 1000
|
671 |
+
if guidance is not None:
|
672 |
+
guidance = guidance.to(hidden_states.dtype) * 1000
|
673 |
+
else:
|
674 |
+
guidance = None
|
675 |
+
|
676 |
+
temb = (
|
677 |
+
self.time_text_embed(timestep, pooled_projections)
|
678 |
+
if guidance is None
|
679 |
+
else self.time_text_embed(timestep, guidance, pooled_projections)
|
680 |
+
)
|
681 |
+
if cond_input is not None:
|
682 |
+
cond_time = 0
|
683 |
+
cond_temb = ( self.time_text_embed(torch.ones_like(timestep)*cond_time, pooled_projections)
|
684 |
+
if guidance is None
|
685 |
+
else self.time_text_embed(torch.ones_like(timestep)*cond_time, guidance, pooled_projections)
|
686 |
+
)
|
687 |
+
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
688 |
+
|
689 |
+
if context_tags is not None:
|
690 |
+
context_tag_embeddings = self.context_tag_embeddings(
|
691 |
+
torch.Tensor(
|
692 |
+
context_tags,
|
693 |
+
).to(device=hidden_states.device, dtype=torch.int64)
|
694 |
+
)
|
695 |
+
bsz = hidden_states.shape[0] // data_num_per_group
|
696 |
+
context_tag_embeddings = context_tag_embeddings.repeat_interleave(bsz, dim=0)
|
697 |
+
if cond_input is not None:
|
698 |
+
encoder_hidden_states = encoder_hidden_states + context_tag_embeddings[0]
|
699 |
+
else:
|
700 |
+
if len(encoder_hidden_states) != len(context_tag_embeddings):
|
701 |
+
encoder_hidden_states += context_tag_embeddings[:1].unsqueeze(1)
|
702 |
+
else:
|
703 |
+
encoder_hidden_states = encoder_hidden_states + context_tag_embeddings.unsqueeze(1)
|
704 |
+
|
705 |
+
if mask_cond is not None:
|
706 |
+
encoder_hidden_states = torch.cat([encoder_hidden_states, mask_cond], dim=1) # todo: compare with add
|
707 |
+
max_sequence_length = encoder_hidden_states.shape[1]
|
708 |
+
|
709 |
+
txt_ids = torch.cat((txt_ids, mask_ids), dim=0)
|
710 |
+
|
711 |
+
if isinstance(img_ids, list):
|
712 |
+
image_rotary_emb = []
|
713 |
+
for img_ids_ in img_ids:
|
714 |
+
ids = torch.cat((txt_ids, img_ids_), dim=0)
|
715 |
+
image_rotary_emb.append(self.pos_embed(ids))
|
716 |
+
image_rotary_emb = ( # to batch, cos / sin
|
717 |
+
torch.stack([_[0] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
|
718 |
+
torch.stack([_[1] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
|
719 |
+
)
|
720 |
+
else:
|
721 |
+
ids = torch.cat((txt_ids, img_ids), dim=0)
|
722 |
+
image_rotary_emb = self.pos_embed(ids)
|
723 |
+
if cond_input is not None:
|
724 |
+
cond_rotary_emb = []
|
725 |
+
for image_ids in cond_image_ids:
|
726 |
+
cond_rotary_emb.append(self.pos_embed(image_ids))
|
727 |
+
cond_rotary_emb = (
|
728 |
+
torch.stack([_[0] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
|
729 |
+
torch.stack([_[1] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
|
730 |
+
)
|
731 |
+
|
732 |
+
if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
|
733 |
+
ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
|
734 |
+
ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
|
735 |
+
joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
|
736 |
+
|
737 |
+
for index_block, block in enumerate(self.transformer_blocks):
|
738 |
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
739 |
+
|
740 |
+
def create_custom_forward(module, return_dict=None):
|
741 |
+
def custom_forward(*inputs):
|
742 |
+
if return_dict is not None:
|
743 |
+
return module(*inputs, return_dict=return_dict)
|
744 |
+
else:
|
745 |
+
return module(*inputs)
|
746 |
+
|
747 |
+
return custom_forward
|
748 |
+
|
749 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
750 |
+
# ckpt_kwargs.update(joint_attention_kwargs)
|
751 |
+
block_output = torch.utils.checkpoint.checkpoint(
|
752 |
+
create_custom_forward(block),
|
753 |
+
hidden_states,
|
754 |
+
encoder_hidden_states,
|
755 |
+
temb,
|
756 |
+
image_rotary_emb,
|
757 |
+
data_num_per_group,
|
758 |
+
max_sequence_length,
|
759 |
+
mix_attention_double,
|
760 |
+
cond_temb if cond_input is not None else None,
|
761 |
+
cond_rotary_emb if cond_input is not None else None,
|
762 |
+
cond_latents if cond_input is not None else None,
|
763 |
+
joint_attention_kwargs,
|
764 |
+
**ckpt_kwargs,
|
765 |
+
)
|
766 |
+
else:
|
767 |
+
block_output = block(
|
768 |
+
hidden_states=hidden_states,
|
769 |
+
encoder_hidden_states=encoder_hidden_states,
|
770 |
+
temb=temb,
|
771 |
+
image_rotary_emb=image_rotary_emb,
|
772 |
+
data_num_per_group=data_num_per_group,
|
773 |
+
max_sequence_length=max_sequence_length,
|
774 |
+
mix_attention=mix_attention_double,
|
775 |
+
cond_temb = cond_temb if cond_input is not None else None,
|
776 |
+
cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
|
777 |
+
cond_latents = cond_latents if cond_input is not None else None,
|
778 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
779 |
+
)
|
780 |
+
|
781 |
+
if cond_input is not None and mix_attention_double:
|
782 |
+
encoder_hidden_states, hidden_states, cond_latents = block_output
|
783 |
+
else:
|
784 |
+
encoder_hidden_states, hidden_states = block_output
|
785 |
+
|
786 |
+
# controlnet residual
|
787 |
+
if controlnet_block_samples is not None:
|
788 |
+
interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
|
789 |
+
interval_control = int(np.ceil(interval_control))
|
790 |
+
# For Xlabs ControlNet.
|
791 |
+
if controlnet_blocks_repeat:
|
792 |
+
hidden_states = (
|
793 |
+
hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
|
794 |
+
)
|
795 |
+
else:
|
796 |
+
hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
|
797 |
+
|
798 |
+
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
799 |
+
|
800 |
+
for index_block, block in enumerate(self.single_transformer_blocks):
|
801 |
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
802 |
+
|
803 |
+
def create_custom_forward(module, return_dict=None):
|
804 |
+
def custom_forward(*inputs):
|
805 |
+
if return_dict is not None:
|
806 |
+
return module(*inputs, return_dict=return_dict)
|
807 |
+
else:
|
808 |
+
return module(*inputs)
|
809 |
+
|
810 |
+
return custom_forward
|
811 |
+
|
812 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
813 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
814 |
+
create_custom_forward(block),
|
815 |
+
hidden_states,
|
816 |
+
temb,
|
817 |
+
image_rotary_emb,
|
818 |
+
data_num_per_group,
|
819 |
+
max_sequence_length,
|
820 |
+
mix_attention_single,
|
821 |
+
cond_temb if cond_input is not None else None,
|
822 |
+
cond_rotary_emb if cond_input is not None else None,
|
823 |
+
cond_latents if cond_input is not None else None,
|
824 |
+
joint_attention_kwargs,
|
825 |
+
**ckpt_kwargs,
|
826 |
+
)
|
827 |
+
|
828 |
+
else:
|
829 |
+
hidden_states = block(
|
830 |
+
hidden_states=hidden_states,
|
831 |
+
temb=temb,
|
832 |
+
image_rotary_emb=image_rotary_emb,
|
833 |
+
data_num_per_group=data_num_per_group,
|
834 |
+
max_sequence_length=max_sequence_length,
|
835 |
+
mix_attention=mix_attention_single,
|
836 |
+
cond_temb = cond_temb if cond_input is not None else None,
|
837 |
+
cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
|
838 |
+
cond_latents = cond_latents if cond_input is not None else None,
|
839 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
840 |
+
)
|
841 |
+
|
842 |
+
if cond_input is not None and mix_attention_single:
|
843 |
+
hidden_states, cond_latents = hidden_states
|
844 |
+
|
845 |
+
# controlnet residual
|
846 |
+
if controlnet_single_block_samples is not None:
|
847 |
+
interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
|
848 |
+
interval_control = int(np.ceil(interval_control))
|
849 |
+
hidden_states[:, encoder_hidden_states.shape[1]:, ...] = (
|
850 |
+
hidden_states[:, encoder_hidden_states.shape[1]:, ...]
|
851 |
+
+ controlnet_single_block_samples[index_block // interval_control]
|
852 |
+
)
|
853 |
+
|
854 |
+
hidden_states = hidden_states[:, encoder_hidden_states.shape[1]:, ...]
|
855 |
+
|
856 |
+
hidden_states = self.norm_out(hidden_states, temb)
|
857 |
+
output = self.proj_out(hidden_states)
|
858 |
+
|
859 |
+
if USE_PEFT_BACKEND:
|
860 |
+
# remove `lora_scale` from each PEFT layer
|
861 |
+
unscale_lora_layers(self, lora_scale)
|
862 |
+
|
863 |
+
if not return_dict:
|
864 |
+
return (output,)
|
865 |
+
|
866 |
+
return Transformer2DModelOutput(sample=output)
|
dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc
ADDED
Binary file (8.68 kB). View file
|
|
dreamfuse/trains/utils/inference_utils.py
ADDED
@@ -0,0 +1,386 @@
1 |
+
import torch
|
2 |
+
from diffusers.utils.torch_utils import randn_tensor
|
3 |
+
import numpy as np
|
4 |
+
from einops import rearrange
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
def get_mask_affine(mask1, mask2):
|
8 |
+
box1 = mask1.getbbox()
|
9 |
+
box2 = mask2.getbbox()
|
10 |
+
|
11 |
+
if box1 is None or box2 is None:
|
12 |
+
affine_coeffs = [1, 0, 0, 0, 1, 0]
|
13 |
+
return affine_coeffs
|
14 |
+
|
15 |
+
left1, top1, right1, bottom1 = box1
|
16 |
+
left2, top2, right2, bottom2 = box2
|
17 |
+
|
18 |
+
w1, h1 = right1 - left1, bottom1 - top1
|
19 |
+
w2, h2 = right2 - left2, bottom2 - top2
|
20 |
+
|
21 |
+
scale_x = w1 / w2
|
22 |
+
scale_y = h1 / h2
|
23 |
+
|
24 |
+
tx = left1 - left2*scale_x
|
25 |
+
ty = top1 - top2*scale_y
|
26 |
+
|
27 |
+
affine_coeffs = [scale_x, 0, tx, 0, scale_y, ty]
|
28 |
+
return affine_coeffs
|
29 |
+
|
30 |
+
def tokenize_prompt(tokenizer, prompt, max_sequence_length):
|
31 |
+
text_inputs = tokenizer(
|
32 |
+
prompt,
|
33 |
+
padding="max_length",
|
34 |
+
max_length=max_sequence_length,
|
35 |
+
truncation=True,
|
36 |
+
return_length=False,
|
37 |
+
return_overflowing_tokens=False,
|
38 |
+
return_tensors="pt",
|
39 |
+
)
|
40 |
+
text_input_ids = text_inputs.input_ids
|
41 |
+
return text_input_ids
|
42 |
+
|
43 |
+
|
44 |
+
def _encode_prompt_with_t5(
|
45 |
+
text_encoder,
|
46 |
+
tokenizer,
|
47 |
+
max_sequence_length=512,
|
48 |
+
prompt=None,
|
49 |
+
num_images_per_prompt=1,
|
50 |
+
device=None,
|
51 |
+
text_input_ids=None,
|
52 |
+
):
|
53 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
54 |
+
batch_size = len(prompt)
|
55 |
+
|
56 |
+
if tokenizer is not None:
|
57 |
+
text_inputs = tokenizer(
|
58 |
+
prompt,
|
59 |
+
padding="max_length",
|
60 |
+
max_length=max_sequence_length,
|
61 |
+
truncation=True,
|
62 |
+
return_length=False,
|
63 |
+
return_overflowing_tokens=False,
|
64 |
+
return_tensors="pt",
|
65 |
+
)
|
66 |
+
text_input_ids = text_inputs.input_ids
|
67 |
+
else:
|
68 |
+
if text_input_ids is None:
|
69 |
+
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
70 |
+
prompt_embeds = text_encoder(text_input_ids.to(device))[0]
|
71 |
+
|
72 |
+
dtype = text_encoder.dtype
|
73 |
+
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
74 |
+
|
75 |
+
_, seq_len, _ = prompt_embeds.shape
|
76 |
+
|
77 |
+
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
|
78 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
79 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
80 |
+
|
81 |
+
return prompt_embeds
|
82 |
+
|
83 |
+
|
84 |
+
def _encode_prompt_with_clip(
|
85 |
+
text_encoder,
|
86 |
+
tokenizer,
|
87 |
+
prompt: str,
|
88 |
+
device=None,
|
89 |
+
text_input_ids=None,
|
90 |
+
num_images_per_prompt: int = 1,
|
91 |
+
):
|
92 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
93 |
+
batch_size = len(prompt)
|
94 |
+
|
95 |
+
if tokenizer is not None:
|
96 |
+
text_inputs = tokenizer(
|
97 |
+
prompt,
|
98 |
+
padding="max_length",
|
99 |
+
max_length=77,
|
100 |
+
truncation=True,
|
101 |
+
return_overflowing_tokens=False,
|
102 |
+
return_length=False,
|
103 |
+
return_tensors="pt",
|
104 |
+
)
|
105 |
+
|
106 |
+
text_input_ids = text_inputs.input_ids
|
107 |
+
else:
|
108 |
+
if text_input_ids is None:
|
109 |
+
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
110 |
+
|
111 |
+
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
112 |
+
|
113 |
+
# Use pooled output of CLIPTextModel
|
114 |
+
prompt_embeds = prompt_embeds.pooler_output
|
115 |
+
prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
|
116 |
+
|
117 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
118 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
119 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
120 |
+
|
121 |
+
return prompt_embeds
|
122 |
+
|
123 |
+
|
124 |
+
def compute_text_embeddings(config, prompt, text_encoders, tokenizers, device):
|
125 |
+
with torch.no_grad():
|
126 |
+
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
|
127 |
+
text_encoders, tokenizers, prompt, config.max_sequence_length
|
128 |
+
)
|
129 |
+
prompt_embeds = prompt_embeds.to(device)
|
130 |
+
pooled_prompt_embeds = pooled_prompt_embeds.to(device)
|
131 |
+
text_ids = text_ids.to(device)
|
132 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
133 |
+
|
134 |
+
|
135 |
+
def _prepare_image_ids(height, width, offset_h=0, offset_w=0):
|
136 |
+
image_ids = torch.zeros(height, width, 3)
|
137 |
+
image_ids[..., 1] = image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
|
138 |
+
image_ids[..., 2] = image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
|
139 |
+
image_ids = image_ids.reshape(-1, 3)
|
140 |
+
return image_ids
|
141 |
+
|
142 |
+
|
143 |
+
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
|
144 |
+
latents = latents.view(
|
145 |
+
batch_size, num_channels_latents, height // 2, 2, width // 2, 2
|
146 |
+
)
|
147 |
+
latents = latents.permute(0, 2, 4, 1, 3, 5)
|
148 |
+
latents = latents.reshape(
|
149 |
+
batch_size, (height // 2) * (width // 2), num_channels_latents * 4
|
150 |
+
)
|
151 |
+
|
152 |
+
return latents
|
153 |
+
|
154 |
+
def _unpack_latents(latents, height, width, vae_downsample_factor):
|
155 |
+
batch_size, num_patches, channels = latents.shape
|
156 |
+
|
157 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
158 |
+
# latent height and width to be divisible by 2.
|
159 |
+
height = 2 * (int(height) // (vae_downsample_factor * 2))
|
160 |
+
width = 2 * (int(width) // (vae_downsample_factor * 2))
|
161 |
+
|
162 |
+
latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
|
163 |
+
latents = latents.permute(0, 3, 1, 4, 2, 5)
|
164 |
+
|
165 |
+
latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
|
166 |
+
|
167 |
+
return latents
|
168 |
+
|
169 |
+
|
170 |
+
def _prepare_latent_image_ids(batch_size, height, width, device, dtype, offset_h=0, offset_w=0):
|
171 |
+
latent_image_ids = torch.zeros(height, width, 3)
|
172 |
+
latent_image_ids[..., 1] = (
|
173 |
+
latent_image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
|
174 |
+
)
|
175 |
+
latent_image_ids[..., 2] = (
|
176 |
+
latent_image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
|
177 |
+
)
|
178 |
+
|
179 |
+
latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
|
180 |
+
latent_image_ids.shape
|
181 |
+
)
|
182 |
+
|
183 |
+
latent_image_ids = latent_image_ids.reshape(
|
184 |
+
latent_image_id_height * latent_image_id_width, latent_image_id_channels
|
185 |
+
)
|
186 |
+
|
187 |
+
return latent_image_ids.to(device=device, dtype=dtype)
|
188 |
+
|
189 |
+
|
190 |
+
def pil_to_tensor(image, device="cpu"):
|
191 |
+
image = np.array(image)
|
192 |
+
image = torch.from_numpy(image).float() / 127.5 - 1.0
|
193 |
+
image = image.permute(2, 0, 1).to(device)
|
194 |
+
return image
|
195 |
+
|
196 |
+
@torch.no_grad()
|
197 |
+
def encode_images_cond(vae_model, condition_images, device):
|
198 |
+
condition_image_tensors = []
|
199 |
+
for condition_image in condition_images:
|
200 |
+
condition_image_tensor = torch.tensor(np.array(condition_image)).to(device).permute(0, 3, 1, 2) # shape: [n_cond, c, h, w]
|
201 |
+
condition_image_tensor = condition_image_tensor / 127.5 - 1.0
|
202 |
+
condition_image_tensors.append(condition_image_tensor)
|
203 |
+
condition_image_tensors = torch.stack(condition_image_tensors) # shape: [bs, n_cond, c, h, w]
|
204 |
+
condition_image_tensors = rearrange(condition_image_tensors, 'b n c h w -> (b n) c h w')
|
205 |
+
|
206 |
+
# encode condition images
|
207 |
+
condition_image_latents = (
|
208 |
+
vae_model.encode(
|
209 |
+
condition_image_tensors.to(vae_model.dtype)
|
210 |
+
).latent_dist.sample()
|
211 |
+
) # shape: [bs*n_cond, c, h // 8, w // 8]
|
212 |
+
condition_image_latents = (condition_image_latents - vae_model.config.shift_factor) * vae_model.config.scaling_factor
|
213 |
+
|
214 |
+
return condition_image_latents
|
215 |
+
|
216 |
+
|
217 |
+
def prepare_latents(
|
218 |
+
batch_size,
|
219 |
+
num_channels_latents,
|
220 |
+
vae_downsample_factor,
|
221 |
+
height,
|
222 |
+
width,
|
223 |
+
dtype,
|
224 |
+
device,
|
225 |
+
generator,
|
226 |
+
latents=None,
|
227 |
+
offset=None,
|
228 |
+
hw=False,
|
229 |
+
):
|
230 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
231 |
+
# latent height and width to be divisible by 2.
|
232 |
+
height = 2 * (int(height) // (vae_downsample_factor * 2))
|
233 |
+
width = 2 * (int(width) // (vae_downsample_factor * 2))
|
234 |
+
|
235 |
+
shape = (batch_size, num_channels_latents, height, width)
|
236 |
+
|
237 |
+
if latents is not None:
|
238 |
+
if offset is None:
|
239 |
+
latent_image_ids = _prepare_latent_image_ids(
|
240 |
+
batch_size, height // 2, width // 2, device, dtype
|
241 |
+
)
|
242 |
+
else:
|
243 |
+
latent_image_ids = []
|
244 |
+
for offset_ in offset:
|
245 |
+
latent_image_ids.append(
|
246 |
+
_prepare_latent_image_ids(
|
247 |
+
batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
|
248 |
+
)
|
249 |
+
)
|
250 |
+
return latents.to(device=device, dtype=dtype), latent_image_ids
|
251 |
+
|
252 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
253 |
+
raise ValueError(
|
254 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
255 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
256 |
+
)
|
257 |
+
|
258 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
259 |
+
latents = _pack_latents(
|
260 |
+
latents, batch_size, num_channels_latents, height, width
|
261 |
+
)
|
262 |
+
if offset is None:
|
263 |
+
latent_image_ids = _prepare_latent_image_ids(
|
264 |
+
batch_size, height // 2, width // 2, device, dtype
|
265 |
+
)
|
266 |
+
else:
|
267 |
+
latent_image_ids = []
|
268 |
+
for offset_ in offset:
|
269 |
+
latent_image_ids.append(
|
270 |
+
_prepare_latent_image_ids(
|
271 |
+
batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
|
272 |
+
)
|
273 |
+
)
|
274 |
+
return latents, latent_image_ids
|
275 |
+
|
276 |
+
|
277 |
+
@torch.no_grad()
|
278 |
+
def encode_prompt(
|
279 |
+
text_encoders,
|
280 |
+
tokenizers,
|
281 |
+
prompt: str,
|
282 |
+
max_sequence_length,
|
283 |
+
device=None,
|
284 |
+
num_images_per_prompt: int = 1,
|
285 |
+
text_input_ids_list=None,
|
286 |
+
):
|
287 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
288 |
+
dtype = text_encoders[0].dtype
|
289 |
+
|
290 |
+
pooled_prompt_embeds = _encode_prompt_with_clip(
|
291 |
+
text_encoder=text_encoders[0],
|
292 |
+
tokenizer=tokenizers[0],
|
293 |
+
prompt=prompt,
|
294 |
+
device=device if device is not None else text_encoders[0].device,
|
295 |
+
num_images_per_prompt=num_images_per_prompt,
|
296 |
+
text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
|
297 |
+
)
|
298 |
+
|
299 |
+
prompt_embeds = _encode_prompt_with_t5(
|
300 |
+
text_encoder=text_encoders[1],
|
301 |
+
tokenizer=tokenizers[1],
|
302 |
+
max_sequence_length=max_sequence_length,
|
303 |
+
prompt=prompt,
|
304 |
+
num_images_per_prompt=num_images_per_prompt,
|
305 |
+
device=device if device is not None else text_encoders[1].device,
|
306 |
+
text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
|
307 |
+
)
|
308 |
+
|
309 |
+
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
310 |
+
|
311 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
312 |
+
|
313 |
+
def warp_affine_tensor(input_tensor, mask_affines, output_size, scale_factor=1/16,
|
314 |
+
align_corners_grid=False, align_corners_sample=True,
|
315 |
+
flatten_output=True, device=None):
|
316 |
+
"""
|
317 |
+
Applies an affine transformation to the input tensor and returns the warped result.
|
318 |
+
|
319 |
+
Args:
|
320 |
+
input_tensor: image tensor to warp; supported shapes are (H, W, C), (C, H, W) or (1, C, H, W).
|
321 |
+
mask_affines: affine parameters (e.g. [a, 0, t_x, 0, e, t_y]); their units assume a 512×512 image.
|
322 |
+
output_size: target spatial size, given as (H_out, W_out).
|
323 |
+
scale_factor: scaling factor for the translation terms; e.g. for 512→32, factor = 32/512 = 1/16.
|
324 |
+
align_corners_grid: align_corners flag passed to F.affine_grid.
|
325 |
+
align_corners_sample: align_corners flag passed to F.grid_sample.
|
326 |
+
flatten_output: if True, reshape the warped output from (1, C, H_out, W_out) to (-1, C).
|
327 |
+
device: if set, move the relevant tensors to this device.
|
328 |
+
|
329 |
+
Returns:
|
330 |
+
warped_output: the affine-warped tensor;
|
331 |
+
shape (H_out*W_out, C) if flatten_output is True, otherwise (1, C, H_out, W_out).
|
332 |
+
"""
|
333 |
+
# If the input tensor is not batched (4D), reshape it to (1, C, H, W)
|
334 |
+
if input_tensor.dim() == 3:
|
335 |
+
# If the last dimension is 3, treat the layout as (H, W, C), i.e. RGB
|
336 |
+
if input_tensor.shape[-1] == 3:
|
337 |
+
input_tensor = input_tensor.permute(2, 0, 1)
|
338 |
+
input_tensor = input_tensor.unsqueeze(0)
|
339 |
+
elif input_tensor.dim() != 4:
|
340 |
+
raise ValueError("input_tensor 必须是 3D 或 4D Tensor!")
|
341 |
+
|
342 |
+
# 输出尺寸
|
343 |
+
H_out, W_out = output_size
|
344 |
+
B, C, H_in, W_in = input_tensor.shape
|
345 |
+
|
346 |
+
# Convert mask_affines to a tensor of shape (1, 6)
|
347 |
+
if not torch.is_tensor(mask_affines):
|
348 |
+
theta = torch.tensor(mask_affines, dtype=torch.float32).unsqueeze(0)
|
349 |
+
else:
|
350 |
+
theta = mask_affines.clone().float()
|
351 |
+
if theta.dim() == 1:
|
352 |
+
theta = theta.unsqueeze(0)
|
353 |
+
|
354 |
+
# 调整平移部分(第三和第六个元素),使其适应当前目标分辨率
|
355 |
+
theta[0, 2] *= scale_factor # x 方向平移
|
356 |
+
theta[0, 5] *= scale_factor # y 方向平移
|
357 |
+
|
358 |
+
a = theta[0, 0]
|
359 |
+
t_x = theta[0, 2]
|
360 |
+
e = theta[0, 4]
|
361 |
+
t_y = theta[0, 5]
|
362 |
+
|
363 |
+
# 根据归一化转换(范围 [-1, 1])
|
364 |
+
# 对 x 方向:归一化公式为 x_norm = 2*x/(W_out-1) - 1
|
365 |
+
# 转换后 affine 的常数项即为:a + 2*t_x/(W_out-1) - 1
|
366 |
+
theta_norm = torch.tensor([
|
367 |
+
[a, 0.0, a + 2*t_x/(W_out - 1) - 1],
|
368 |
+
[0.0, e, e + 2*t_y/(H_out - 1) - 1]
|
369 |
+
], dtype=torch.float32).unsqueeze(0)
|
370 |
+
|
371 |
+
# 根据目标输出大小创建 affine_grid,grid 的 size 为 (B, C, H_out, W_out)
|
372 |
+
grid = F.affine_grid(theta_norm, size=(B, C, H_out, W_out), align_corners=align_corners_grid)
|
373 |
+
if device is not None:
|
374 |
+
grid = grid.to(device)
|
375 |
+
input_tensor = input_tensor.to(device)
|
376 |
+
|
377 |
+
# 对输入 tensor 进行采样
|
378 |
+
warped = F.grid_sample(input_tensor, grid, align_corners=align_corners_sample)
|
379 |
+
|
380 |
+
# 若需要将输出展平为 (-1, C)
|
381 |
+
if flatten_output:
|
382 |
+
# 将 (1, C, H_out, W_out) → 转为 (H_out, W_out, C) → reshape(-1, C)
|
383 |
+
warped = warped.squeeze(0).permute(1, 2, 0).reshape(-1, C)
|
384 |
+
return warped
|
385 |
+
|
386 |
+
|
dreamfuse_inference.py
ADDED
@@ -0,0 +1,642 @@
1 |
+
import gc
|
2 |
+
import os
|
3 |
+
from typing import List
|
4 |
+
import contextlib
|
5 |
+
import torch.multiprocessing as mp
|
6 |
+
from dataclasses import dataclass, field
|
7 |
+
from collections import defaultdict
|
8 |
+
import random
|
9 |
+
import numpy as np
|
10 |
+
from PIL import Image, ImageOps
|
11 |
+
import json
|
12 |
+
import torch
|
13 |
+
from peft import PeftModel
|
14 |
+
import torch.nn.functional as F
|
15 |
+
import accelerate
|
16 |
+
import diffusers
|
17 |
+
from diffusers import FluxPipeline
|
18 |
+
from diffusers.utils.torch_utils import is_compiled_module
|
19 |
+
import transformers
|
20 |
+
from tqdm import tqdm
|
21 |
+
from peft import LoraConfig, set_peft_model_state_dict
|
22 |
+
from peft.utils import get_peft_model_state_dict
|
23 |
+
from dreamfuse.models.dreamfuse_flux.transformer import (
|
24 |
+
FluxTransformer2DModel,
|
25 |
+
FluxTransformerBlock,
|
26 |
+
FluxSingleTransformerBlock,
|
27 |
+
)
|
28 |
+
from diffusers.schedulers.scheduling_flow_match_euler_discrete import (
|
29 |
+
FlowMatchEulerDiscreteScheduler,
|
30 |
+
)
|
31 |
+
from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps
|
32 |
+
from dreamfuse.trains.utils.inference_utils import (
|
33 |
+
compute_text_embeddings,
|
34 |
+
prepare_latents,
|
35 |
+
_unpack_latents,
|
36 |
+
_pack_latents,
|
37 |
+
_prepare_image_ids,
|
38 |
+
encode_images_cond,
|
39 |
+
get_mask_affine,
|
40 |
+
warp_affine_tensor
|
41 |
+
)
|
42 |
+
|
43 |
+
|
44 |
+
def seed_everything(seed):
|
45 |
+
torch.manual_seed(seed)
|
46 |
+
torch.cuda.manual_seed(seed)
|
47 |
+
random.seed(seed)
|
48 |
+
np.random.seed(seed)
|
49 |
+
|
50 |
+
@dataclass
|
51 |
+
class InferenceConfig:
|
52 |
+
# Model paths
|
53 |
+
flux_model_id: str = 'black-forest-labs/FLUX.1-dev'
|
54 |
+
|
55 |
+
lora_id: str = ''
|
56 |
+
model_choice: str = 'dev'
|
57 |
+
# Model configs
|
58 |
+
lora_rank: int = 16
|
59 |
+
max_sequence_length: int = 256
|
60 |
+
guidance_scale: float = 3.5
|
61 |
+
num_inference_steps: int = 28
|
62 |
+
mask_ids: int = 16
|
63 |
+
mask_in_chans: int = 128
|
64 |
+
mask_out_chans: int = 3072
|
65 |
+
inference_scale = 1024
|
66 |
+
|
67 |
+
# Training configs
|
68 |
+
gradient_checkpointing: bool = False
|
69 |
+
mix_attention_double: bool = True
|
70 |
+
mix_attention_single: bool = True
|
71 |
+
|
72 |
+
# Image processing
|
73 |
+
image_ids_offset: List[int] = field(default_factory=lambda: [0, 0, 0])
|
74 |
+
image_tags: List[int] = field(default_factory=lambda: [0, 1, 2])
|
75 |
+
context_tags: List[int] = None
|
76 |
+
|
77 |
+
# Runtime configs
|
78 |
+
device: str = "cuda:0" # if torch.cuda.is_available() else "cpu"
|
79 |
+
dtype: torch.dtype = torch.bfloat16
|
80 |
+
seed: int = 1234
|
81 |
+
debug: bool = True
|
82 |
+
|
83 |
+
# I/O configs
|
84 |
+
valid_output_dir: str = "./inference_output"
|
85 |
+
valid_roots: List[str] = field(default_factory=lambda: [
|
86 |
+
"./",
|
87 |
+
])
|
88 |
+
valid_jsons: List[str] = field(default_factory=lambda: [
|
89 |
+
"./examples/data_dreamfuse.json",
|
90 |
+
])
|
91 |
+
ref_prompts: str = ""
|
92 |
+
|
93 |
+
truecfg: bool = False
|
94 |
+
text_strength: int = 5
|
95 |
+
|
96 |
+
# multi gpu
|
97 |
+
sub_idx:int = 0
|
98 |
+
total_num:int = 1
|
99 |
+
|
100 |
+
def adjust_fg_to_bg(image: Image.Image, mask: Image.Image, target_size: tuple) -> tuple[Image.Image, Image.Image]:
|
101 |
+
width, height = image.size
|
102 |
+
target_w, target_h = target_size
|
103 |
+
|
104 |
+
scale = min(target_w / width, target_h / height)
|
105 |
+
if scale < 1:
|
106 |
+
new_w = int(width * scale)
|
107 |
+
new_h = int(height * scale)
|
108 |
+
image = image.resize((new_w, new_h))
|
109 |
+
mask = mask.resize((new_w, new_h))
|
110 |
+
width, height = new_w, new_h
|
111 |
+
|
112 |
+
pad_w = target_w - width
|
113 |
+
pad_h = target_h - height
|
114 |
+
padding = (
|
115 |
+
pad_w // 2, # left
|
116 |
+
pad_h // 2, # top
|
117 |
+
(pad_w + 1) // 2, # right
|
118 |
+
(pad_h + 1) // 2 # bottom
|
119 |
+
)
|
120 |
+
|
121 |
+
image = ImageOps.expand(image, border=padding, fill=(255, 255, 255))
|
122 |
+
mask = ImageOps.expand(mask, border=padding, fill=0)
|
123 |
+
|
124 |
+
return image, mask
|
125 |
+
|
126 |
+
def find_nearest_bucket_size(input_width, input_height, mode="x64", bucket_size=1024):
|
127 |
+
"""
|
128 |
+
Finds the nearest bucket size for the given input size.
|
129 |
+
"""
|
130 |
+
buckets = {
|
131 |
+
512: [[ 256, 768 ], [ 320, 768 ], [ 320, 704 ], [ 384, 640 ], [ 448, 576 ], [ 512, 512 ], [ 576, 448 ], [ 640, 384 ], [ 704, 320 ], [ 768, 320 ], [ 768, 256 ]],
|
132 |
+
768: [[ 384, 1152 ], [ 480, 1152 ], [ 480, 1056 ], [ 576, 960 ], [ 672, 864 ], [ 768, 768 ], [ 864, 672 ], [ 960, 576 ], [ 1056, 480 ], [ 1152, 480 ], [ 1152, 384 ]],
|
133 |
+
1024: [[ 512, 1536 ], [ 640, 1536 ], [ 640, 1408 ], [ 768, 1280 ], [ 896, 1152 ], [ 1024, 1024 ], [ 1152, 896 ], [ 1280, 768 ], [ 1408, 640 ], [ 1536, 640 ], [ 1536, 512 ]]
|
134 |
+
}
|
135 |
+
|
136 |
+
buckets = buckets[bucket_size]
|
137 |
+
|
138 |
+
aspect_ratios = [w / h for (w, h) in buckets]
|
139 |
+
assert mode in ["x64", "x8"]
|
140 |
+
if mode == "x64":
|
141 |
+
asp = input_width / input_height
|
142 |
+
diff = [abs(ar - asp) for ar in aspect_ratios]
|
143 |
+
bucket_id = int(np.argmin(diff))
|
144 |
+
gen_width, gen_height = buckets[bucket_id]
|
145 |
+
elif mode == "x8":
|
146 |
+
max_pixels = 1024 * 1024
|
147 |
+
ratio = (max_pixels / (input_width * input_height)) ** (0.5)
|
148 |
+
gen_width, gen_height = round(input_width * ratio), round(input_height * ratio)
|
149 |
+
gen_width = gen_width - gen_width % 8
|
150 |
+
gen_height = gen_height - gen_height % 8
|
151 |
+
else:
|
152 |
+
raise NotImplementedError
|
153 |
+
return (gen_width, gen_height)
|
154 |
+
|
155 |
+
def make_image_grid(images, rows, cols, size=None):
|
156 |
+
assert len(images) == rows * cols
|
157 |
+
|
158 |
+
if size is not None:
|
159 |
+
images = [img.resize((size[0], size[1])) for img in images]
|
160 |
+
|
161 |
+
w, h = images[0].size
|
162 |
+
grid = Image.new("RGB", size=(cols * w, rows * h))
|
163 |
+
|
164 |
+
for i, img in enumerate(images):
|
165 |
+
grid.paste(img.convert("RGB"), box=(i % cols * w, i // cols * h))
|
166 |
+
return grid
|
167 |
+
|
168 |
+
class DreamFuseInference:
|
169 |
+
def __init__(self, config: InferenceConfig):
|
170 |
+
self.config = config
|
171 |
+
print(config.device)
|
172 |
+
self.device = torch.device(config.device)
|
173 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
174 |
+
seed_everything(config.seed)
|
175 |
+
self._init_models()
|
176 |
+
|
177 |
+
def _init_models(self):
|
178 |
+
# Initialize tokenizers
|
179 |
+
self.tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
|
180 |
+
self.config.flux_model_id, subfolder="tokenizer"
|
181 |
+
)
|
182 |
+
self.tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
|
183 |
+
self.config.flux_model_id, subfolder="tokenizer_2"
|
184 |
+
)
|
185 |
+
|
186 |
+
# Initialize text encoders
|
187 |
+
self.text_encoder_one = transformers.CLIPTextModel.from_pretrained(
|
188 |
+
self.config.flux_model_id, subfolder="text_encoder"
|
189 |
+
).to(device=self.device, dtype=self.config.dtype)
|
190 |
+
self.text_encoder_two = transformers.T5EncoderModel.from_pretrained(
|
191 |
+
self.config.flux_model_id, subfolder="text_encoder_2"
|
192 |
+
).to(device=self.device, dtype=self.config.dtype)
|
193 |
+
|
194 |
+
# Initialize VAE
|
195 |
+
self.vae = diffusers.AutoencoderKL.from_pretrained(
|
196 |
+
self.config.flux_model_id, subfolder="vae"
|
197 |
+
).to(device=self.device, dtype=self.config.dtype)
|
198 |
+
|
199 |
+
# Initialize denoising model
|
200 |
+
self.denoise_model = FluxTransformer2DModel.from_pretrained(
|
201 |
+
self.config.flux_model_id, subfolder="transformer"
|
202 |
+
).to(device=self.device, dtype=self.config.dtype)
|
203 |
+
|
204 |
+
if self.config.image_tags is not None or self.config.context_tags is not None:
|
205 |
+
num_image_tag_embeddings = max(self.config.image_tags) + 1 if self.config.image_tags is not None else 0
|
206 |
+
num_context_tag_embeddings = max(self.config.context_tags) + 1 if self.config.context_tags is not None else 0
|
207 |
+
self.denoise_model.set_tag_embeddings(
|
208 |
+
num_image_tag_embeddings=num_image_tag_embeddings,
|
209 |
+
num_context_tag_embeddings=num_context_tag_embeddings,
|
210 |
+
)
|
211 |
+
|
212 |
+
# Add LoRA
|
213 |
+
self.denoise_model = PeftModel.from_pretrained(
|
214 |
+
self.denoise_model,
|
215 |
+
self.config.lora_id,
|
216 |
+
adapter_weights=[1.0],
|
217 |
+
device_map={"": self.device}
|
218 |
+
)
|
219 |
+
|
220 |
+
# Initialize scheduler
|
221 |
+
self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
|
222 |
+
self.config.flux_model_id, subfolder="scheduler"
|
223 |
+
)
|
224 |
+
|
225 |
+
# Set models to eval mode
|
226 |
+
for model in [self.text_encoder_one, self.text_encoder_two, self.vae, self.denoise_model]:
|
227 |
+
model.eval()
|
228 |
+
model.requires_grad_(False)
|
229 |
+
|
230 |
+
def _compute_text_embeddings(self, prompt):
|
231 |
+
return compute_text_embeddings(
|
232 |
+
self.config,
|
233 |
+
prompt,
|
234 |
+
[self.text_encoder_one, self.text_encoder_two],
|
235 |
+
[self.tokenizer_one, self.tokenizer_two],
|
236 |
+
self.device
|
237 |
+
)
|
238 |
+
|
239 |
+
def resize_to_fit_within(self, reference_image, target_image):
|
240 |
+
ref_width, ref_height = reference_image.size
|
241 |
+
target_width, target_height = target_image.size
|
242 |
+
|
243 |
+
scale_width = ref_width / target_width
|
244 |
+
scale_height = ref_height / target_height
|
245 |
+
scale = min(scale_width, scale_height)  # use the smaller ratio so the result stays within the reference image
|
246 |
+
|
247 |
+
new_width = int(target_width * scale)
|
248 |
+
new_height = int(target_height * scale)
|
249 |
+
|
250 |
+
resized_image = target_image.resize((new_width, new_height), Image.LANCZOS)
|
251 |
+
return resized_image
|
252 |
+
|
253 |
+
def pad_or_crop(self, img, target_size, fill_color=(255, 255, 255)):
|
254 |
+
"""
|
255 |
+
Center-aligns the input image and crops or pads it to target_size.
|
256 |
+
|
257 |
+
Args:
|
258 |
+
img - a PIL.Image object
|
259 |
+
target_size - target size (width, height)
|
260 |
+
fill_color - fill colour, white by default
|
261 |
+
|
262 |
+
Returns:
|
263 |
+
The adjusted PIL.Image object, of size target_size
|
264 |
+
"""
|
265 |
+
iw, ih = img.size
|
266 |
+
tw, th = target_size
|
267 |
+
|
268 |
+
# Compute the crop region: if the image is larger than the target, crop the centre; otherwise keep it all
|
269 |
+
left = (iw - tw) // 2 if iw >= tw else 0
|
270 |
+
top = (ih - th) // 2 if ih >= th else 0
|
271 |
+
cropped = img.crop((left, top, left + min(iw, tw), top + min(ih, th)))
|
272 |
+
|
273 |
+
# Create a new image of the target size and paste the cropped image centred
|
274 |
+
new_img = Image.new(img.mode, target_size, fill_color)
|
275 |
+
offset = ((tw - cropped.width) // 2, (th - cropped.height) // 2)
|
276 |
+
new_img.paste(cropped, offset)
|
277 |
+
|
278 |
+
return new_img
|
279 |
+
|
280 |
+
def transform_foreground_original(self, original_fg, original_bg, transformation_info, canvas_size=400):
|
281 |
+
"""
|
282 |
+
Translates the original foreground image (original_fg) according to transformation_info.
|
283 |
+
Requirements:
|
284 |
+
1. The output image has the same size as original_fg (the original foreground size is preserved);
|
285 |
+
2. for the offset computation, the drag coordinates are converted back to the unscaled space, i.e. drag_left/drag_top divided by scale_ratio;
|
286 |
+
3. the relative drag offset is measured on the 400x400 preview canvas against the default (centred) unscaled position,
|
287 |
+
and is then rescaled to the actual offset (in pixels) at the original foreground resolution.
|
288 |
+
4. The result is pasted onto a canvas of the original foreground size (uncovered areas are filled with white).
|
289 |
+
|
290 |
+
Args:
|
291 |
+
original_fg: the originally uploaded foreground image (PIL Image)
|
292 |
+
transformation_info: dict that must contain the following fields:
|
293 |
+
- "drag_left": x coordinate of the displayed foreground's top-left corner after dragging (affected by scaling, in pixels)
|
294 |
+
- "drag_top": y coordinate of the displayed foreground's top-left corner after dragging (affected by scaling, in pixels)
|
295 |
+
- "scale_ratio": scale ratio of the foreground in the preview
|
296 |
+
- "data_original_width": width of the foreground in the preview before scaling
|
297 |
+
- "data_original_height": height of the foreground in the preview before scaling
|
298 |
+
canvas_size: preview canvas size (400 by default, matching the front end)
|
299 |
+
|
300 |
+
Returns:
|
301 |
+
The processed image (PIL Image), the same size as original_fg,
|
302 |
+
translated according to the relative drag offset in the unscaled space.
|
303 |
+
"""
|
304 |
+
# 读取 transformation_info 中的参数
|
305 |
+
drag_left = float(transformation_info.get("drag_left", 0))
|
306 |
+
drag_top = float(transformation_info.get("drag_top", 0))
|
307 |
+
scale_ratio = float(transformation_info.get("scale_ratio", 1))
|
308 |
+
data_orig_width = float(transformation_info.get("data_original_width", canvas_size))
|
309 |
+
data_orig_height = float(transformation_info.get("data_original_height", canvas_size))
|
310 |
+
drag_width = float(transformation_info.get("drag_width", 0))
|
311 |
+
drag_height = float(transformation_info.get("drag_height", 0))
|
312 |
+
|
313 |
+
|
314 |
+
scale_ori_fg = canvas_size / max(original_fg.width, original_fg.height)
|
315 |
+
scale_ori_bg = canvas_size / max(original_bg.width, original_bg.height)
|
316 |
+
|
317 |
+
# 计算未缩放状态下(预览中)的默认居中位置(前景图未拖拽时的理想位置)
|
318 |
+
default_left = (canvas_size - data_orig_width) / 2.0
|
319 |
+
default_top = (canvas_size - data_orig_height) / 2.0
|
320 |
+
|
321 |
+
# 在未缩放状态下,计算实际拖拽产生的偏移(单位:像素,在预览尺寸下计算)
|
322 |
+
offset_preview_x = drag_left - default_left
|
323 |
+
offset_preview_y = drag_top - default_top
|
324 |
+
|
325 |
+
offset_ori_x = offset_preview_x / scale_ori_fg
|
326 |
+
offset_ori_y = offset_preview_y / scale_ori_fg
|
327 |
+
|
328 |
+
new_width = int(original_fg.width * scale_ratio)
|
329 |
+
new_height = int(original_fg.height * scale_ratio)
|
330 |
+
scale_fg = original_fg.resize((new_width, new_height))
|
331 |
+
|
332 |
+
output = Image.new("RGBA", (original_fg.width, original_fg.height), (255, 255, 255, 0))
|
333 |
+
output.paste(scale_fg, (int(offset_ori_x), int(offset_ori_y)))
|
334 |
+
|
335 |
+
new_width_fgbg = original_fg.width * scale_ori_fg / scale_ori_bg
|
336 |
+
new_height_fgbg = original_fg.height * scale_ori_fg / scale_ori_bg
|
337 |
+
scale_fgbg = output.resize((int(new_width_fgbg), int(new_height_fgbg)))
|
338 |
+
|
339 |
+
|
340 |
+
final_output = Image.new("RGBA", (original_bg.width, original_bg.height), (255, 255, 255, 0))
|
341 |
+
scale_fgbg = self.pad_or_crop(scale_fgbg, (original_bg.width, original_bg.height), (255, 255, 255, 0))
|
342 |
+
final_output.paste(scale_fgbg, (0, 0))
|
343 |
+
|
344 |
+
fit_fg = self.resize_to_fit_within(original_bg, original_fg)
|
345 |
+
fit_fg = self.pad_or_crop(fit_fg, original_bg.size, (255, 255, 255, 0))
|
346 |
+
|
347 |
+
return final_output, fit_fg
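    # Illustrative note (not in the original file): the preview-to-original mapping is a
    # division by the preview scale. With a 2000x1000 foreground, canvas_size=400 gives
    # scale_ori_fg = 400 / 2000 = 0.2, so a drag offset of 40 preview pixels becomes
    # 40 / 0.2 = 200 pixels at the original foreground resolution.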

    @torch.inference_mode()
    def gradio_generate(self, background_img, foreground_img, transformation_info, seed, prompt, enable_gui, cfg=3.5, size_select="1024", text_strength=1, truecfg=False):
        """Run DreamFuse model inference for the Gradio demo."""
        print("!" * 10)
        try:
            trans = json.loads(transformation_info)
        except Exception:
            trans = {}

        size_select = int(size_select)

        # import pdb; pdb.set_trace()
        r, g, b, ori_a = foreground_img.split()
        fg_img_scale, fg_img = self.transform_foreground_original(foreground_img, background_img, trans)

        new_r, new_g, new_b, new_a = fg_img_scale.split()
        foreground_img_scale = Image.merge("RGB", (new_r, new_g, new_b))

        r, g, b, ori_a = fg_img.split()
        foreground_img = Image.merge("RGB", (r, g, b))
        foreground_img_save = foreground_img.copy()
        ori_a = ori_a.convert("L")
        new_a = new_a.convert("L")
        # White out everything outside the foreground's alpha mask
        foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_a))
        print("0" * 10)
        print(foreground_img.size)
        print(background_img.size)
        images = self.model_generate(foreground_img.copy(), background_img.copy(),
                                     ori_a, new_a,
                                     enable_mask_affine=enable_gui,
                                     prompt=prompt,
                                     offset_cond=[0, 1, 0] if not enable_gui else None,
                                     seed=seed,
                                     cfg=cfg,
                                     size_select=size_select,
                                     text_strength=text_strength,
                                     truecfg=truecfg)
        images = Image.fromarray(images[0], "RGB")

        images = images.resize(background_img.size)
        images_save = images.copy()

        images.thumbnail((640, 640), Image.LANCZOS)
        return images
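    # Illustrative note (not in the original file): transformation_info arrives as a JSON
    # string produced by the drag-and-drop front end. Based on the keys read above, a
    # hypothetical payload could look like:
    #   {"drag_left": 120, "drag_top": 80, "scale_ratio": 0.75,
    #    "data_original_width": 300, "data_original_height": 200,
    #    "drag_width": 225, "drag_height": 150}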


    @torch.inference_mode()
    def model_generate(self, fg_image, bg_image, ori_fg_mask, new_fg_mask, enable_mask_affine=True, prompt="", offset_cond=None, seed=None, cfg=3.5, size_select=1024, text_strength=1, truecfg=False):
        batch_size = 1
        print("-3" * 10)
        # Prepare images: adjust the foreground to the background size, then snap to a resolution bucket
        fg_image, ori_fg_mask = adjust_fg_to_bg(fg_image, ori_fg_mask, bg_image.size)
        bucket_size = find_nearest_bucket_size(bg_image.size[0], bg_image.size[1], bucket_size=size_select)

        fg_image = fg_image.resize(bucket_size)
        bg_image = bg_image.resize(bucket_size)

        mask_affine = None
        if enable_mask_affine:
            ori_fg_mask = ori_fg_mask.resize(bucket_size)
            new_fg_mask = new_fg_mask.resize(bucket_size)
            mask_affine = get_mask_affine(new_fg_mask, ori_fg_mask)

        print("-2" * 10)
        # Get text embeddings
        prompt_embeds, pooled_prompt_embeds, text_ids = self._compute_text_embeddings(prompt)

        prompt_embeds = prompt_embeds.repeat(1, text_strength, 1)
        text_ids = text_ids.repeat(text_strength, 1)

        # Prepare guidance (the 'dev' model choice takes an explicit guidance input)
        if self.config.model_choice == "dev":
            guidance = torch.full([1], cfg, device=self.device, dtype=torch.float32)
            guidance = guidance.expand(batch_size)
        else:
            guidance = None

        # Prepare generator
        if seed is None:
            seed = self.config.seed
        generator = torch.Generator(device=self.device).manual_seed(seed)
        print("-1" * 10)
        # Prepare condition latents
        condition_image_latents = self._encode_images([fg_image, bg_image])

        if offset_cond is None:
            offset_cond = self.config.image_ids_offset
        offset_cond = offset_cond[1:]
        cond_latent_image_ids = []
        for offset_ in offset_cond:
            cond_latent_image_ids.append(
                self._prepare_image_ids(
                    condition_image_latents.shape[2] // 2,
                    condition_image_latents.shape[3] // 2,
                    offset_w=offset_ * condition_image_latents.shape[3] // 2
                )
            )

        print(1)
        if mask_affine is not None:
            affine_H, affine_W = condition_image_latents.shape[2] // 2, condition_image_latents.shape[3] // 2
            scale_factor = 1 / 16
            cond_latent_image_ids_fg = cond_latent_image_ids[0].reshape(affine_H, affine_W, 3).clone()

            # opt 1: warp the foreground's positional ids with the mask affine transform
            cond_latent_image_ids[0] = warp_affine_tensor(
                cond_latent_image_ids_fg, mask_affine, output_size=(affine_H, affine_W),
                scale_factor=scale_factor, device=self.device,
            )
        cond_latent_image_ids = torch.stack(cond_latent_image_ids)
        print(2)
        # Pack condition latents
        cond_image_latents = self._pack_latents(condition_image_latents)
        cond_input = {
            "image_latents": cond_image_latents,
            "image_ids": cond_latent_image_ids,
        }
        # Prepare initial latents
        width, height = bucket_size
        num_channels_latents = self.denoise_model.config.in_channels // 4
        latents, latent_image_ids = self._prepare_latents(
            batch_size, num_channels_latents, height, width, generator
        )
        print(3)
        # Setup timesteps
        sigmas = np.linspace(1.0, 1 / self.config.num_inference_steps, self.config.num_inference_steps)
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.base_image_seq_len,
            self.scheduler.config.max_image_seq_len,
            self.scheduler.config.base_shift,
            self.scheduler.config.max_shift,
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            self.config.num_inference_steps,
            self.device,
            sigmas=sigmas,
            mu=mu,
        )
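        # Illustrative note (not in the original file): with num_inference_steps = 4 the base
        # schedule is np.linspace(1.0, 0.25, 4) = [1.0, 0.75, 0.5, 0.25]; calculate_shift then
        # derives mu from the image sequence length, so larger images have their sigmas
        # shifted toward the noisier end of the schedule.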
        print(4)
        # Denoising loop
        for i, t in enumerate(timesteps):
            timestep = t.expand(latents.shape[0]).to(latents.dtype)
            with torch.autocast(enabled=True, device_type="cuda", dtype=self.config.dtype):
                noise_pred = self.denoise_model(
                    hidden_states=latents,
                    cond_input=cond_input,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    data_num_per_group=batch_size,
                    image_tags=self.config.image_tags,
                    context_tags=self.config.context_tags,
                    max_sequence_length=self.config.max_sequence_length,
                    mix_attention_double=self.config.mix_attention_double,
                    mix_attention_single=self.config.mix_attention_single,
                    joint_attention_kwargs=None,
                    return_dict=False,
                )[0]

                if truecfg and i >= 1:
                    # NOTE: guidance_neg is prepared here, but the call below still passes the
                    # positive `guidance` tensor.
                    guidance_neg = torch.full([1], 1, device=self.device, dtype=torch.float32)
                    guidance_neg = guidance_neg.expand(batch_size)
                    noise_pred_neg = self.denoise_model(
                        hidden_states=latents,
                        cond_input=cond_input,
                        timestep=timestep / 1000,
                        guidance=guidance,
                        pooled_projections=pooled_prompt_embeds,
                        encoder_hidden_states=prompt_embeds,
                        txt_ids=text_ids,
                        img_ids=latent_image_ids,
                        data_num_per_group=batch_size,
                        image_tags=self.config.image_tags,
                        context_tags=self.config.context_tags,
                        max_sequence_length=self.config.max_sequence_length,
                        mix_attention_double=self.config.mix_attention_double,
                        mix_attention_single=self.config.mix_attention_single,
                        joint_attention_kwargs=None,
                        return_dict=False,
                    )[0]
                    noise_pred = noise_pred_neg + 5 * (noise_pred - noise_pred_neg)

            # Compute previous noisy sample
            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
        print(5)
        # Decode latents
        latents = self._unpack_latents(latents, height, width)
        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
        images = self.vae.decode(latents, return_dict=False)[0]
        print(6)
        # Post-process images to uint8 HWC numpy arrays
        images = images.add(1).mul(127.5).clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1).cpu().numpy()
        return images
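    # Illustrative note (not in the original file): the truecfg branch combines the two
    # predictions as neg + 5 * (pos - neg), i.e. a classifier-free-guidance-style mix with a
    # hard-coded weight of 5 applied on top of the embedded guidance scale.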

    def _encode_images(self, images):
        return encode_images_cond(self.vae, [images], self.device)

    def _prepare_image_ids(self, h, w, offset_w=0):
        return _prepare_image_ids(h, w, offset_w=offset_w).to(self.device)

    def _pack_latents(self, latents):
        b, c, h, w = latents.shape
        return _pack_latents(latents, b, c, h, w)
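    # Illustrative note (not in the original file): in Flux-style pipelines, packing is
    # assumed to fold each 2x2 latent patch into the channel axis, turning a (B, C, H, W)
    # latent into a (B, H/2 * W/2, C * 4) token sequence; the positional image ids prepared
    # above index those H/2 x W/2 patches.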

    def _unpack_latents(self, latents, height, width):
        vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
        return _unpack_latents(latents, height, width, vae_scale)

    def _prepare_latents(self, batch_size, num_channels_latents, height, width, generator):
        vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
        latents, latent_image_ids = prepare_latents(
            batch_size=batch_size,
            num_channels_latents=num_channels_latents,
            vae_downsample_factor=vae_scale,
            height=height,
            width=width,
            dtype=self.config.dtype,
            device=self.device,
            generator=generator,
            offset=None
        )
        return latents, latent_image_ids


def main():
    parser = transformers.HfArgumentParser(InferenceConfig)
    config: InferenceConfig = parser.parse_args_into_dataclasses()[0]
    model = DreamFuseInference(config)
    os.makedirs(config.valid_output_dir, exist_ok=True)
    for valid_root, valid_json in zip(config.valid_roots, config.valid_jsons):
        with open(valid_json, 'r') as f:
            valid_info = json.load(f)

        # Multi-GPU sharding: each worker processes a deterministic slice of the sorted keys
        to_process = sorted(list(valid_info.keys()))

        # debug
        to_process = [k for k in to_process if "data_wear" in k and "pixelwave" in k]
        # debug

        sd_idx = len(to_process) // config.total_num * config.sub_idx
        ed_idx = len(to_process) // config.total_num * (config.sub_idx + 1)
        if config.sub_idx < config.total_num - 1:
            print(config.sub_idx, sd_idx, ed_idx)
            to_process = to_process[sd_idx:ed_idx]
        else:
            print(config.sub_idx, sd_idx)
            to_process = to_process[sd_idx:]
        valid_info = {k: valid_info[k] for k in to_process}

        for meta_key, info in tqdm(valid_info.items()):
            img_name = meta_key.split('/')[-1]

            foreground_img = Image.open(os.path.join(valid_root, info['img_info']['000']))
            background_img = Image.open(os.path.join(valid_root, info['img_info']['001']))

            new_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000_mask_scale']))
            ori_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000']))

            # debug: dump the raw inputs next to the outputs
            foreground_img.save(os.path.join(config.valid_output_dir, f"{img_name}_0.png"))
            background_img.save(os.path.join(config.valid_output_dir, f"{img_name}_1.png"))
            ori_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask.png"))
            new_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask_scale.png"))
            # debug

            foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_fg_mask))

            images = model(foreground_img.copy(), background_img.copy(),
                           ori_fg_mask, new_fg_mask,
                           prompt=config.ref_prompts,
                           seed=config.seed,
                           cfg=config.guidance_scale,
                           size_select=config.inference_scale,
                           text_strength=config.text_strength,
                           truecfg=config.truecfg)

            result_image = Image.fromarray(images[0], "RGB")
            result_image = result_image.resize(background_img.size)
            result_image.save(os.path.join(config.valid_output_dir, f"{img_name}_2.png"))
            # Make a side-by-side grid of foreground, background and result
            grid_image = [foreground_img, background_img] + [result_image]
            result = make_image_grid(grid_image, 1, len(grid_image), size=result_image.size)

            result.save(os.path.join(config.valid_output_dir, f"{img_name}.jpg"))


if __name__ == "__main__":
    main()
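
# Minimal CLI sketch (not in the original file; flag names follow the InferenceConfig fields
# referenced in main(), everything else is an assumption):
#   python dreamfuse_inference.py \
#       --valid_roots /path/to/data \
#       --valid_jsons /path/to/meta.json \
#       --valid_output_dir ./output_images \
#       --seed 1234
# Each entry in the JSON metadata is expected to provide img_info['000'] / ['001'] for the
# foreground/background and img_mask_info['000'] / ['000_mask_scale'] for the masks.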
examples/9_01.png
ADDED
(binary image stored via Git LFS)

examples/9_02.png
ADDED
(binary image stored via Git LFS)

output_images/no_bg_image.png
ADDED
(binary image stored via Git LFS)
requirements.txt
ADDED
@@ -0,0 +1,37 @@
lmdb==1.4.1
tqdm==4.66.1
datasets
tensorboardX
accelerate
ninja
transformers==4.46.3
pycocotools==2.0.7
scikit-image
Pillow==9.5.0
opencv-python
opencv-python-headless
datasets
einops==0.8.0
sentencepiece
pydantic==2.9.2
deepspeed
peft==0.14.0
diffusers==0.32.0
rotary-embedding-torch==0.8.4
tiktoken==0.8.0
transformers_stream_generator==0.0.5
ftfy
bs4
bson==0.5.10
gradio==5.12.0
httpx
fairscale==0.4.13
kornia
timm==1.0.9
protobuf==3.20.0
basicsr
sentencepiece
huggingface_hub
prodigyopt
torch==2.4.0
torchvision==0.19.0