Spaces: Running on Zero
Commit · f96f6770
Parent(s): test

Browse files
- .gitattributes +36 -0
- README.md +13 -0
- __pycache__/dreamfuse_inference.cpython-310.pyc +0 -0
- app.py +491 -0
- dreamfuse/.DS_Store +0 -0
- dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc +0 -0
- dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc +0 -0
- dreamfuse/models/dreamfuse_flux/flux_processor.py +269 -0
- dreamfuse/models/dreamfuse_flux/transformer.py +866 -0
- dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc +0 -0
- dreamfuse/trains/utils/inference_utils.py +386 -0
- dreamfuse_inference.py +642 -0
- examples/9_01.png +3 -0
- examples/9_02.png +3 -0
- output_images/no_bg_image.png +3 -0
- requirements.txt +37 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
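These rules route matching binary artifacts (model weights, archives, and, in this Space, all PNGs) through Git LFS instead of storing them directly in the repository. As a hedged illustration only (not part of this commit), an additional pattern would normally be registered with the `git lfs track` CLI; the snippet below drives it from Python, mirroring app.py's own use of subprocess. The `*.webp` pattern is a hypothetical example and git-lfs is assumed to be installed.

import subprocess

# `git lfs track` appends a "<pattern> filter=lfs diff=lfs merge=lfs -text" line to .gitattributes.
subprocess.run(["git", "lfs", "track", "*.webp"], check=True)
# Stage the updated .gitattributes so the new rule ships with the next commit.
subprocess.run(["git", "add", ".gitattributes"], check=True)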
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: DreamFuse
emoji: 📚
colorFrom: indigo
colorTo: blue
sdk: gradio
sdk_version: 5.24.0
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/dreamfuse_inference.cpython-310.pyc
ADDED
Binary file (14.1 kB)
app.py
ADDED
@@ -0,0 +1,491 @@
import gradio as gr
import spaces
from PIL import Image, ImageDraw, ImageOps
import base64, json
from io import BytesIO
import torch.nn.functional as F
import json
from typing import List
from dataclasses import dataclass, field
from dreamfuse_inference import DreamFuseInference, InferenceConfig
import numpy as np
import os
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
import torch
import subprocess
subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
generated_images = []


RMBG_model = AutoModelForImageSegmentation.from_pretrained('briaai/RMBG-2.0', trust_remote_code=True)
RMBG_model = RMBG_model.to("cuda")
transform = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

@spaces.GPU
def remove_bg(image):
    # Predict a foreground mask with RMBG-2.0 and resize it back to the input resolution.
    im = image.convert("RGB")
    input_tensor = transform(im).unsqueeze(0).to("cuda")
    with torch.no_grad():
        preds = RMBG_model(input_tensor)[-1].sigmoid().cpu()[0].squeeze()
    mask = transforms.ToPILImage()(preds).resize(im.size)
    return mask

class DreamblendGUI:
    def __init__(self):
        self.examples = [
            ["./examples/9_02.png",
             "./examples/9_01.png"],
        ]
        self.examples = [[Image.open(x) for x in example] for example in self.examples]
        self.css_style = self._get_css_style()
        self.js_script = self._get_js_script()

    def _get_css_style(self):
        return """
        body {
            background: transparent;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            color: #fff;
        }
        .gradio-container {
            max-width: 1200px;
            margin: auto;
            background: transparent;
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0px 2px 8px rgba(255,255,255,0.1);
        }
        h1, h2 {
            text-align: center;
            color: #fff;
        }
        #canvas_preview {
            border: 2px dashed rgba(255,255,255,0.5);
            padding: 10px;
            background: transparent;
            border-radius: 8px;
        }
        .gr-button {
            background-color: #007bff;
            border: none;
            color: #fff;
            padding: 10px 20px;
            border-radius: 5px;
            font-size: 16px;
            cursor: pointer;
        }
        .gr-button:hover {
            background-color: #0056b3;
        }
        #small-examples {
            max-width: 200px !important;
            width: 200px !important;
            float: left;
            margin-right: 20px;
        }
        """

    def _get_js_script(self):
        return r"""
        async () => {
            window.updateTransformation = function() {
                const img = document.getElementById('draggable-img');
                const container = document.getElementById('canvas-container');
                if (!img || !container) return;
                const left = parseFloat(img.style.left) || 0;
                const top = parseFloat(img.style.top) || 0;

                const canvasSize = 400;
                const data_original_width = parseFloat(img.getAttribute('data-original-width'));
                const data_original_height = parseFloat(img.getAttribute('data-original-height'));
                const bgWidth = parseFloat(container.dataset.bgWidth);
                const bgHeight = parseFloat(container.dataset.bgHeight);
                const scale_ratio = img.clientWidth / data_original_width;

                const transformation = {
                    drag_left: left,
                    drag_top: top,
                    drag_width: img.clientWidth,
                    drag_height: img.clientHeight,
                    data_original_width: data_original_width,
                    data_original_height: data_original_height,
                    scale_ratio: scale_ratio
                };

                const transInput = document.querySelector("#transformation_info textarea");
                if(transInput){
                    const newValue = JSON.stringify(transformation);
                    const nativeSetter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
                    nativeSetter.call(transInput, newValue);
                    transInput.dispatchEvent(new Event('input', { bubbles: true }));
                    console.log("Transformation info updated: ", newValue);
                } else {
                    console.log("Could not find the transformation_info textarea element");
                }
            };

            globalThis.initializeDrag = () => {
                console.log("Initializing drag and scale handlers...");
                const observer = new MutationObserver(() => {
                    const img = document.getElementById('draggable-img');
                    const container = document.getElementById('canvas-container');
                    const slider = document.getElementById('scale-slider');
                    if (img && container && slider) {
                        observer.disconnect();
                        console.log("Binding drag and scale events...");
                        img.ondragstart = (e) => { e.preventDefault(); return false; };
                        let offsetX = 0, offsetY = 0;
                        let isDragging = false;
                        let scaleAnchor = null;

                        img.addEventListener('mousedown', (e) => {
                            isDragging = true;
                            img.style.cursor = 'grabbing';
                            const imgRect = img.getBoundingClientRect();
                            offsetX = e.clientX - imgRect.left;
                            offsetY = e.clientY - imgRect.top;
                            img.style.transform = "none";
                            img.style.left = img.offsetLeft + "px";
                            img.style.top = img.offsetTop + "px";
                            console.log("mousedown: left=", img.style.left, "top=", img.style.top);
                        });
                        document.addEventListener('mousemove', (e) => {
                            if (!isDragging) return;
                            e.preventDefault();

                            const containerRect = container.getBoundingClientRect();
                            // Current drag position relative to the container
                            let left = e.clientX - containerRect.left - offsetX;
                            let top = e.clientY - containerRect.top - offsetY;

                            // Allowed horizontal drag range:
                            // at most 7/8 of the image may leave the left edge: min = -img.clientWidth * (7/8)
                            // at most 7/8 may leave the right edge: max = containerRect.width - img.clientWidth * (1/8)
                            const minLeft = -img.clientWidth * (7/8);
                            const maxLeft = containerRect.width - img.clientWidth * (1/8);

                            // Allowed vertical drag range:
                            // min = -img.clientHeight * (7/8)
                            // max = containerRect.height - img.clientHeight * (1/8)
                            const minTop = -img.clientHeight * (7/8);
                            const maxTop = containerRect.height - img.clientHeight * (1/8);

                            // Clamp to the allowed range
                            if (left < minLeft) left = minLeft;
                            if (left > maxLeft) left = maxLeft;

                            if (top < minTop) top = minTop;
                            if (top > maxTop) top = maxTop;

                            img.style.left = left + "px";
                            img.style.top = top + "px";
                        });

                        window.addEventListener('mouseup', (e) => {
                            if (isDragging) {
                                isDragging = false;
                                img.style.cursor = 'grab';
                                const containerRect = container.getBoundingClientRect();
                                const bgWidth = parseFloat(container.dataset.bgWidth);
                                const bgHeight = parseFloat(container.dataset.bgHeight);
                                const offsetLeft = (containerRect.width - bgWidth) / 2;
                                const offsetTop = (containerRect.height - bgHeight) / 2;
                                const absoluteLeft = parseFloat(img.style.left);
                                const absoluteTop = parseFloat(img.style.top);
                                const relativeX = absoluteLeft - offsetLeft;
                                const relativeY = absoluteTop - offsetTop;
                                document.getElementById("coordinate").textContent =
                                    `Foreground position: (x=${relativeX.toFixed(2)}, y=${relativeY.toFixed(2)})`;
                                updateTransformation();
                            }
                            scaleAnchor = null;
                        });

                        slider.addEventListener('mousedown', (e) => {
                            const containerRect = container.getBoundingClientRect();
                            const imgRect = img.getBoundingClientRect();
                            scaleAnchor = {
                                x: imgRect.left + imgRect.width/2 - containerRect.left,
                                y: imgRect.top + imgRect.height/2 - containerRect.top
                            };
                            console.log("Slider mousedown, captured scaleAnchor: ", scaleAnchor);
                        });

                        slider.addEventListener('input', (e) => {
                            const scale = parseFloat(e.target.value);
                            const originalWidth = parseFloat(img.getAttribute('data-original-width'));
                            const originalHeight = parseFloat(img.getAttribute('data-original-height'));
                            const newWidth = originalWidth * scale;
                            const newHeight = originalHeight * scale;
                            const containerRect = container.getBoundingClientRect();
                            let centerX, centerY;
                            if (scaleAnchor) {
                                centerX = scaleAnchor.x;
                                centerY = scaleAnchor.y;
                            } else {
                                const imgRect = img.getBoundingClientRect();
                                centerX = imgRect.left + imgRect.width/2 - containerRect.left;
                                centerY = imgRect.top + imgRect.height/2 - containerRect.top;
                            }
                            const newLeft = centerX - newWidth/2;
                            const newTop = centerY - newHeight/2;
                            img.style.width = newWidth + "px";
                            img.style.height = newHeight + "px";
                            img.style.left = newLeft + "px";
                            img.style.top = newTop + "px";
                            console.log("slider: scale=", scale, "newWidth=", newWidth, "newHeight=", newHeight);
                            updateTransformation();
                        });

                        slider.addEventListener('mouseup', (e) => {
                            scaleAnchor = null;
                        });
                    }
                });
                observer.observe(document.body, { childList: true, subtree: true });
            };
        }
        """


    def get_next_sequence(self, folder_path):
        # List all filenames in the folder
        filenames = os.listdir(folder_path)
        # Extract the leading sequence number from each filename (assumed to be the digits before "_")
        sequences = [int(name.split('_')[0]) for name in filenames if name.split('_')[0].isdigit()]
        # Find the largest sequence number seen so far
        max_sequence = max(sequences, default=-1)
        # Return the next sequence number, formatted as three digits (e.g. 002)
        return f"{max_sequence + 1:03d}"


    def pil_to_base64(self, img):
        """Convert a PIL Image to a base64 string, keeping the alpha channel via PNG."""
        if img is None:
            return ""
        if img.mode != "RGBA":
            img = img.convert("RGBA")
        buffered = BytesIO()
        img.save(buffered, format="PNG", optimize=True)
        img_bytes = buffered.getvalue()
        base64_str = base64.b64encode(img_bytes).decode()
        return f"data:image/png;base64,{base64_str}"

    def resize_background_image(self, img, max_size=400):
        """Proportionally resize the background image so its longest side is max_size (400)."""
        if img is None:
            return None
        w, h = img.size
        if w > max_size or h > max_size:
            ratio = min(max_size / w, max_size / h)
            new_w, new_h = int(w * ratio), int(h * ratio)
            img = img.resize((new_w, new_h), Image.LANCZOS)
        return img

    def resize_draggable_image(self, img, max_size=400):
        """Proportionally resize the foreground image so its longest side does not exceed max_size (400)."""
        if img is None:
            return None
        w, h = img.size
        if w > max_size or h > max_size:
            ratio = min(max_size / w, max_size / h)
            new_w, new_h = int(w * ratio), int(h * ratio)
            img = img.resize((new_w, new_h), Image.LANCZOS)
        return img

    def generate_html(self, background_img_b64, bg_width, bg_height, draggable_img_b64, draggable_width, draggable_height, canvas_size=400):
        """Generate the preview HTML page."""
        html_code = f"""
        <html>
        <head>
        <style>
        body {{
            margin: 0;
            padding: 0;
            text-align: center;
            font-family: sans-serif;
            background: transparent;
            color: #fff;
        }}
        h2 {{
            margin-top: 1rem;
        }}
        #scale-control {{
            margin: 1rem auto;
            width: 400px;
            text-align: left;
        }}
        #scale-control label {{
            font-size: 1rem;
            margin-right: 0.5rem;
        }}
        #canvas-container {{
            position: relative;
            width: {canvas_size}px;
            height: {canvas_size}px;
            margin: 0 auto;
            border: 1px dashed rgba(255,255,255,0.5);
            overflow: hidden;
            background-image: url('{background_img_b64}');
            background-repeat: no-repeat;
            background-position: center;
            background-size: contain;
            border-radius: 8px;
        }}
        #draggable-img {{
            position: absolute;
            cursor: grab;
            left: 50%;
            top: 50%;
            transform: translate(-50%, -50%);
            background-color: transparent;
        }}
        #coordinate {{
            color: #fff;
            margin-top: 1rem;
            font-weight: bold;
        }}
        </style>
        </head>
        <body>
        <h2>Drag the foreground image (scaling supported)</h2>
        <div id="scale-control">
            <label for="scale-slider">Foreground scale:</label>
            <input type="range" id="scale-slider" min="0.1" max="2" step="0.01" value="1">
        </div>
        <div id="canvas-container" data-bg-width="{bg_width}" data-bg-height="{bg_height}">
            <img id="draggable-img"
                 src="{draggable_img_b64}"
                 alt="Draggable Image"
                 draggable="false"
                 data-original-width="{draggable_width}"
                 data-original-height="{draggable_height}"
            />
        </div>
        <p id="coordinate">Foreground position: (x=?, y=?)</p>
        </body>
        </html>
        """
        return html_code

    def on_upload(self, background_img, draggable_img):
        """Handle the uploaded images."""
        if background_img is None or draggable_img is None:
            return "<p style='color:red;'>Please upload a background image and a draggable foreground image first.</p>"

        if draggable_img.mode != "RGB":
            draggable_img = draggable_img.convert("RGB")
        draggable_img_mask = remove_bg(draggable_img)
        alpha_channel = draggable_img_mask.convert("L")
        draggable_img = draggable_img.convert("RGBA")
        draggable_img.putalpha(alpha_channel)

        resized_bg = self.resize_background_image(background_img, max_size=400)
        bg_w, bg_h = resized_bg.size

        resized_fg = self.resize_draggable_image(draggable_img, max_size=400)
        draggable_width, draggable_height = resized_fg.size

        background_img_b64 = self.pil_to_base64(resized_bg)
        draggable_img_b64 = self.pil_to_base64(resized_fg)

        return self.generate_html(
            background_img_b64, bg_w, bg_h,
            draggable_img_b64, draggable_width, draggable_height,
            canvas_size=400
        ), draggable_img

    def save_image(self, save_path="/mnt/bn/hjj-humanseg-lq/SubjectDriven/DreamFuse/debug"):
        global generated_images
        save_name = self.get_next_sequence(save_path)
        generated_images[0].save(os.path.join(save_path, f"{save_name}_0_ori.png"))
        generated_images[1].save(os.path.join(save_path, f"{save_name}_0.png"))
        generated_images[2].save(os.path.join(save_path, f"{save_name}_1.png"))
        generated_images[3].save(os.path.join(save_path, f"{save_name}_2.png"))
        generated_images[4].save(os.path.join(save_path, f"{save_name}_0_mask.png"))
        generated_images[5].save(os.path.join(save_path, f"{save_name}_0_mask_scale.png"))
        generated_images[6].save(os.path.join(save_path, f"{save_name}_0_scale.png"))
        generated_images[7].save(os.path.join(save_path, f"{save_name}_2_pasted.png"))


    def create_gui(self):
        config = InferenceConfig()
        config.lora_id = 'LL3RD/DreamFuse'

        pipeline = DreamFuseInference(config)
        # Allocate a ZeroGPU slot for up to 120 s per generation call.
        pipeline.gradio_generate = spaces.GPU(duration=120)(pipeline.gradio_generate)
        """Create the Gradio interface."""
        with gr.Blocks(css=self.css_style) as demo:
            modified_fg_state = gr.State()
            gr.Markdown("# Dreamblend-GUI-dirtydata")
            gr.Markdown("Upload a background and a foreground image to get a draggable/scalable preview and a blended result; seed settings and prompt text input are also supported.")
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Upload Images")
                    background_img_in = gr.Image(label="Background Image", type="pil", height=240, width=240)
                    draggable_img_in = gr.Image(label="Foreground Image", type="pil", image_mode="RGBA", height=240, width=240)
                    generate_btn = gr.Button("Generate Draggable Canvas")

                    with gr.Row():
                        gr.Examples(
                            examples=[self.examples[0]],
                            inputs=[background_img_in, draggable_img_in],
                            elem_id="small-examples"
                        )
                with gr.Column(scale=1):
                    gr.Markdown("### Preview Area")
                    html_out = gr.HTML(label="Preview & Drag", elem_id="canvas_preview")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Parameters")
                    seed_slider = gr.Slider(minimum=0, maximum=10000, step=1, label="Seed", value=42)
                    cfg_slider = gr.Slider(minimum=1, maximum=10, step=0.1, label="CFG", value=3.5)
                    size_select = gr.Radio(
                        choices=["512", "768", "1024"],
                        value="512",
                        label="Generation quality (512 = low, 1024 = high)",
                    )
                    prompt_text = gr.Textbox(label="Prompt", placeholder="Enter a text prompt", value="")
                    text_strength = gr.Slider(minimum=1, maximum=10, step=1, label="Text Strength", value=1)
                    enable_gui = gr.Checkbox(label="Enable GUI", value=True)
                    enable_truecfg = gr.Checkbox(label="Enable TrueCFG", value=False)
                    enable_save = gr.Button("Save Images (internal testing)", visible=True)
                with gr.Column(scale=1):
                    gr.Markdown("### Model Output")
                    model_generate_btn = gr.Button("Run Model")
                    transformation_text = gr.Textbox(label="Transformation Info", elem_id="transformation_info", visible=False)
                    model_output = gr.Image(label="Model output", type="pil")

            # Bind interaction events
            enable_save.click(fn=self.save_image, inputs=None, outputs=None)
            generate_btn.click(
                fn=self.on_upload,
                inputs=[background_img_in, draggable_img_in],
                outputs=[html_out, modified_fg_state],
            )
            model_generate_btn.click(
                fn=pipeline.gradio_generate,
                inputs=[background_img_in, modified_fg_state, transformation_text, seed_slider, \
                        prompt_text, enable_gui, cfg_slider, size_select, text_strength, enable_truecfg],
                outputs=model_output
            )
            # Initialize the drag/scale handlers after the page loads
            demo.load(None, None, None, js=self.js_script)
            generate_btn.click(fn=None, inputs=None, outputs=None, js="initializeDrag")

        return demo

if __name__ == "__main__":

    gui = DreamblendGUI()
    demo = gui.create_gui()
    demo.queue()
    demo.launch()
    # demo.launch(server_port=7789, ssr_mode=False)
    # demo.launch(server_name="[::]", share=True)
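For reference, the hidden `transformation_info` textbox above receives a JSON object from the JavaScript (`drag_left`, `drag_top`, `drag_width`, `drag_height`, `data_original_width`, `data_original_height`, `scale_ratio`) describing where the resized foreground sits on the 400 px preview canvas. The mapping back to background-image coordinates happens inside `DreamFuseInference.gradio_generate`, which is not part of this diff, so the sketch below is only an assumption of how such a mapping could look; the helper name `paste_box_on_background` is hypothetical.

import json

def paste_box_on_background(transformation_json, bg_size, canvas_size=400):
    """Return (left, top, width, height) of the dragged foreground in background pixel coordinates."""
    t = json.loads(transformation_json)
    bg_w, bg_h = bg_size
    # The preview fits the background inside the 400 px canvas and centers it.
    preview_ratio = min(canvas_size / bg_w, canvas_size / bg_h)
    preview_w, preview_h = bg_w * preview_ratio, bg_h * preview_ratio
    offset_left = (canvas_size - preview_w) / 2
    offset_top = (canvas_size - preview_h) / 2
    # Convert the dragged box from canvas coordinates back to background coordinates.
    left = (t["drag_left"] - offset_left) / preview_ratio
    top = (t["drag_top"] - offset_top) / preview_ratio
    width = t["drag_width"] / preview_ratio
    height = t["drag_height"] / preview_ratio
    return left, top, width, height

# Example with a 1024x768 background and a foreground dragged near the canvas center:
print(paste_box_on_background(
    '{"drag_left": 150, "drag_top": 100, "drag_width": 100, "drag_height": 100,'
    ' "data_original_width": 200, "data_original_height": 200, "scale_ratio": 0.5}',
    (1024, 768),
))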
dreamfuse/.DS_Store
ADDED
Binary file (6.15 kB)
dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc
ADDED
Binary file (7.61 kB)
dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (23.9 kB)
dreamfuse/models/dreamfuse_flux/flux_processor.py
ADDED
@@ -0,0 +1,269 @@
import inspect
import math
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import deprecate, logging
from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_torch_version, maybe_allow_in_graph
from diffusers.models.attention import Attention
from diffusers.models.embeddings import Timesteps, TimestepEmbedding, PixArtAlphaTextProjection

class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
    def __init__(self, embedding_dim, pooled_projection_dim):
        super().__init__()

        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")

    def forward(self, timestep, guidance, pooled_projection):
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)

        if (guidance >= 0).all():
            guidance_proj = self.time_proj(guidance)
            guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))  # (N, D)

            time_guidance_emb = timesteps_emb + guidance_emb

            pooled_projections = self.text_embedder(pooled_projection)
            conditioning = time_guidance_emb + pooled_projections
        else:
            pooled_projections = self.text_embedder(pooled_projection)
            conditioning = timesteps_emb + pooled_projections

        return conditioning


def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        if cos.ndim == 2:
            cos = cos[None, None]
        else:
            cos = cos.unsqueeze(1)
        if sin.ndim == 2:
            sin = sin[None, None]
        else:
            sin = sin.unsqueeze(1)
        cos, sin = cos.to(x.device), sin.to(x.device)

        if use_real_unbind_dim == -1:
            # Used for flux, cogvideox, hunyuan-dit
            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        elif use_real_unbind_dim == -2:
            # Used for Stable Audio
            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
        else:
            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out
    else:
        # used for lumina
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

        return x_out.type_as(x)

class FluxAttnSharedProcessor2_0:
    """Attention processor used typically in processing the SD3-like self-attention projections."""

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        data_num_per_group: Optional[int] = 1,
        max_sequence_length: Optional[int] = 512,
        mix_attention: bool = True,
        cond_latents=None,
        cond_image_rotary_emb=None,
        work_mode=None,
        mask_cond=None,
    ) -> torch.FloatTensor:
        with_cond = cond_latents is not None and mix_attention

        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

        # `sample` projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)


        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
        if encoder_hidden_states is not None:
            # `context` projections.
            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)
            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)
            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)

            if attn.norm_added_q is not None:
                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
            if attn.norm_added_k is not None:
                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)

            # attention
            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)

        if with_cond:
            cond_bs = cond_latents.shape[0]

            # update condition
            cond_query = attn.to_q(cond_latents)
            cond_query = cond_query.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            if attn.norm_q is not None:
                cond_query = attn.norm_q(cond_query)
            cond_query = apply_rotary_emb(cond_query, cond_image_rotary_emb)
            cond_query = torch.cat(cond_query.chunk(len(cond_query), dim=0), dim=2)

            cond_key = attn.to_k(cond_latents)
            cond_value = attn.to_v(cond_latents)
            cond_key = cond_key.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            cond_value = cond_value.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            if attn.norm_k is not None:
                cond_key = attn.norm_k(cond_key)

            cond_key = apply_rotary_emb(cond_key, cond_image_rotary_emb)

            cond_key = torch.cat(cond_key.chunk(len(cond_key), dim=0), dim=2)
            cond_value = torch.cat(cond_value.chunk(len(cond_value), dim=0), dim=2)

        if data_num_per_group > 1 and mix_attention:
            E = max_sequence_length  # according to text len

            key_enc, key_hid = key[:, :, :E], key[:, :, E:]
            value_enc, value_hid = value[:, :, :E], value[:, :, E:]

            key_layer = key_hid.chunk(data_num_per_group, dim=0)
            key_layer = torch.cat(key_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)

            value_layer = value_hid.chunk(data_num_per_group, dim=0)
            value_layer = torch.cat(value_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)

            key = torch.cat([key_enc, key_layer], dim=2)
            value = torch.cat([value_enc, value_layer], dim=2)

        elif data_num_per_group == 1 and mix_attention and with_cond:
            E = max_sequence_length  # according to text len

            key_enc, key_hid = key[:, :, :E], key[:, :, E:]
            value_enc, value_hid = value[:, :, :E], value[:, :, E:]

            # todo: support bs != 1
            key_layer = torch.cat([key_hid, cond_key], dim=2)
            value_layer = torch.cat([value_hid, cond_value], dim=2)

            key = torch.cat([key_enc, key_layer], dim=2)
            value = torch.cat([value_enc, value_layer], dim=2)

            # concat query
            query_enc, query_hid = query[:, :, :E], query[:, :, E:]
            query_layer = torch.cat([query_hid, cond_query], dim=2)
            query = torch.cat([query_enc, query_layer], dim=2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        if encoder_hidden_states is not None:
            if with_cond:
                encoder_hidden_states, hidden_states, cond_latents = (
                    hidden_states[:, : encoder_hidden_states.shape[1]],
                    hidden_states[:, encoder_hidden_states.shape[1] : -cond_latents.shape[1]*cond_bs],
                    hidden_states[:, -cond_latents.shape[1]*cond_bs :],
                )
                cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
                cond_latents = attn.to_out[0](cond_latents)
                cond_latents = attn.to_out[1](cond_latents)
            else:
                encoder_hidden_states, hidden_states = (
                    hidden_states[:, : encoder_hidden_states.shape[1]],
                    hidden_states[:, encoder_hidden_states.shape[1]:],
                )

            # linear proj
            hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)

            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

            if with_cond:
                return hidden_states, encoder_hidden_states, cond_latents
            return hidden_states, encoder_hidden_states
        else:
            if with_cond:
                hidden_states, cond_latents = (
                    hidden_states[:, : -cond_latents.shape[1]*cond_bs],
                    hidden_states[:, -cond_latents.shape[1]*cond_bs :],
                )
                cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
                return hidden_states, cond_latents
            return hidden_states
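As a quick sanity check of the rotary-embedding helper above, the following toy-shaped sketch (shapes chosen arbitrarily, not taken from this diff) exercises `apply_rotary_emb` along the default `use_real=True` path, where a 2-D `(cos, sin)` table is broadcast over a `[B, H, S, D]` query or key tensor and the output keeps the input shape:

import torch
from dreamfuse.models.dreamfuse_flux.flux_processor import apply_rotary_emb

B, H, S, D = 1, 24, 16, 128                      # batch, heads, tokens, head dim (D must be even)
x = torch.randn(B, H, S, D)                      # a query or key tensor
cos, sin = torch.randn(S, D), torch.randn(S, D)  # precomputed frequency tables, [S, D] each

out = apply_rotary_emb(x, (cos, sin))            # rotates channel pairs per token position
assert out.shape == (B, H, S, D) and out.dtype == x.dtype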
dreamfuse/models/dreamfuse_flux/transformer.py
ADDED
@@ -0,0 +1,866 @@
# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Any, Dict, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
from diffusers.models.attention import FeedForward
from diffusers.models.attention_processor import (
    Attention,
    AttentionProcessor,
    FluxAttnProcessor2_0,
    FluxAttnProcessor2_0_NPU,
    FusedFluxAttnProcessor2_0,
)
from dreamfuse.models.dreamfuse_flux.flux_processor import FluxAttnSharedProcessor2_0
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
from diffusers.utils.import_utils import is_torch_npu_available
from diffusers.utils.torch_utils import maybe_allow_in_graph
from diffusers.models.embeddings import CombinedTimestepTextProjEmbeddings, FluxPosEmbed
from diffusers.models.modeling_outputs import Transformer2DModelOutput

from .flux_processor import CombinedTimestepGuidanceTextProjEmbeddings

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

def zero_module(module):
    for p in module.parameters():
        nn.init.zeros_(p)
    return module

class LayerNorm2d(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class CrossAttention(nn.Module):
    def __init__(self, query_dim: int, cross_attention_dim: int, heads: int = 8, dim_head: int = 64, dropout: float = 0.0, bias: bool = False):
        super().__init__()
        self.heads = heads
        self.dim_head = cross_attention_dim // heads
        self.attn_to_q = nn.Linear(query_dim, cross_attention_dim, bias=bias)
        self.norm_q = nn.LayerNorm(self.dim_head)

        self.attn_to_k = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)
        self.norm_k = nn.LayerNorm(self.dim_head)

        self.attn_to_v = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)

        self.attn_to_out = nn.ModuleList([])
        self.attn_to_out.append(nn.Linear(query_dim, query_dim, bias=bias))
        self.attn_to_out.append(nn.Dropout(dropout))

        # zero init
        with torch.no_grad():
            self.attn_to_out[0].weight.fill_(0)
            # self.to_out[0].bias.fill_(0)

    def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
        batch_size, sequence_length, _ = hidden_states.shape

        query = self.attn_to_q(hidden_states)
        key = self.attn_to_k(encoder_hidden_states)
        value = self.attn_to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // self.heads

        query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)

        query = self.norm_q(query)
        key = self.norm_k(key)

        hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,)
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)

        hidden_states = self.attn_to_out[0](hidden_states)
        hidden_states = self.attn_to_out[1](hidden_states)

        return hidden_states

@maybe_allow_in_graph
class FluxSingleTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

        self.norm = AdaLayerNormZeroSingle(dim)
        self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)

        processor = FluxAttnSharedProcessor2_0()

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            processor=processor,
            qk_norm="rms_norm",
            eps=1e-6,
            pre_only=True,
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        data_num_per_group=1,
        max_sequence_length=512,
        mix_attention: bool = True,
        cond_temb=None,
        cond_image_rotary_emb=None,
        cond_latents=None,
        joint_attention_kwargs=None,

    ):
        with_cond = cond_latents is not None and mix_attention

        residual = hidden_states
        norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
        mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))

        if with_cond:
            residual_cond = cond_latents
            norm_cond_latents, cond_gate = self.norm(cond_latents, emb=cond_temb)
            mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_latents))

        joint_attention_kwargs = joint_attention_kwargs or {}
        attn_output = self.attn(
            hidden_states=norm_hidden_states,
            image_rotary_emb=image_rotary_emb,
            data_num_per_group=data_num_per_group,
            max_sequence_length=max_sequence_length,
            mix_attention=mix_attention,
            cond_latents=norm_cond_latents if with_cond else None,
            cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
            **joint_attention_kwargs,
        )

        if with_cond:
            attn_output, cond_attn_output = attn_output

        hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
        gate = gate.unsqueeze(1)
        hidden_states = gate * self.proj_out(hidden_states)
        hidden_states = residual + hidden_states

        if with_cond:
            cond_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
            cond_gate = cond_gate.unsqueeze(1)
            cond_latents = cond_gate * self.proj_out(cond_latents)
            cond_latents = residual_cond + cond_latents

        if hidden_states.dtype == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)

        if with_cond:
            return hidden_states, cond_latents
        else:
            return hidden_states


@maybe_allow_in_graph
class FluxTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
            processing of `context` conditions.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)

        self.norm1_context = AdaLayerNormZero(dim)

        processor = FluxAttnSharedProcessor2_0()

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
        data_num_per_group=1,
        max_sequence_length=512,
        mix_attention: bool = True,
        cond_temb=None,
        cond_image_rotary_emb=None,
        cond_latents=None,
        joint_attention_kwargs=None,
    ):
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
            encoder_hidden_states, emb=temb
        )
        joint_attention_kwargs = joint_attention_kwargs or {}

        with_cond = cond_latents is not None and mix_attention
        if with_cond:
            norm_cond_latents, cond_gate_msa, cond_shift_mlp, cond_scale_mlp, cond_gate_mlp = self.norm1(cond_latents, emb=cond_temb)

        # Attention.
        attention_outputs = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            image_rotary_emb=image_rotary_emb,
            data_num_per_group=data_num_per_group,
            max_sequence_length=max_sequence_length,
            mix_attention=mix_attention,
            cond_latents=norm_cond_latents if with_cond else None,
            cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
            **joint_attention_kwargs,
        )

        if len(attention_outputs) == 2:
            attn_output, context_attn_output = attention_outputs
        elif len(attention_outputs) == 3 and with_cond:
            attn_output, context_attn_output, cond_attn_output = attention_outputs
        elif len(attention_outputs) == 3:
            attn_output, context_attn_output, ip_attn_output = attention_outputs

        # Process attention outputs for the `hidden_states`.
        attn_output = gate_msa.unsqueeze(1) * attn_output
        hidden_states = hidden_states + attn_output

        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp.unsqueeze(1) * ff_output

        hidden_states = hidden_states + ff_output
        if len(attention_outputs) == 3 and not with_cond:
            hidden_states = hidden_states + ip_attn_output

        if with_cond:
            cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
            cond_latents = cond_latents + cond_attn_output

            norm_cond_latents = self.norm2(cond_latents)
            norm_cond_latents = norm_cond_latents * (1 + cond_scale_mlp[:, None]) + cond_shift_mlp[:, None]

            cond_ff_output = self.ff(norm_cond_latents)
            cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output

            cond_latents = cond_latents + cond_ff_output
        # Process attention outputs for the `encoder_hidden_states`.

        context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
        encoder_hidden_states = encoder_hidden_states + context_attn_output

        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]

        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        if with_cond:
            return encoder_hidden_states, hidden_states, cond_latents
        else:
            return encoder_hidden_states, hidden_states


class FluxTransformer2DModel(
    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
):
    """
    The Transformer model introduced in Flux.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Parameters:
        patch_size (`int`): Patch size to turn the input data into small patches.
        in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
        num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
        num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
        num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
        guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]

    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 64,
        out_channels: Optional[int] = None,
        num_layers: int = 19,
        num_single_layers: int = 38,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 4096,
        pooled_projection_dim: int = 768,
        guidance_embeds: bool = False,
        axes_dims_rope: Tuple[int] = (16, 56, 56),
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
        if getattr(self.config, "num_image_tag_embeddings", None) is not None:
            self.image_tag_embeddings = nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim)
        if getattr(self.config, "num_context_tag_embeddings", None) is not None:
            self.context_tag_embeddings = nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim)

        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)

        text_time_guidance_cls = (
            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
        )
        self.time_text_embed = text_time_guidance_cls(
            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
|
402 |
+
)
|
403 |
+
|
404 |
+
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
405 |
+
self.x_embedder = nn.Linear(self.config.in_channels, self.inner_dim)
|
406 |
+
|
407 |
+
self.transformer_blocks = nn.ModuleList(
|
408 |
+
[
|
409 |
+
FluxTransformerBlock(
|
410 |
+
dim=self.inner_dim,
|
411 |
+
num_attention_heads=self.config.num_attention_heads,
|
412 |
+
attention_head_dim=self.config.attention_head_dim,
|
413 |
+
)
|
414 |
+
for i in range(self.config.num_layers)
|
415 |
+
]
|
416 |
+
)
|
417 |
+
|
418 |
+
self.single_transformer_blocks = nn.ModuleList(
|
419 |
+
[
|
420 |
+
FluxSingleTransformerBlock(
|
421 |
+
dim=self.inner_dim,
|
422 |
+
num_attention_heads=self.config.num_attention_heads,
|
423 |
+
attention_head_dim=self.config.attention_head_dim,
|
424 |
+
)
|
425 |
+
for i in range(self.config.num_single_layers)
|
426 |
+
]
|
427 |
+
)
|
428 |
+
|
429 |
+
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
|
430 |
+
self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
|
431 |
+
|
432 |
+
self.gradient_checkpointing = False
|
433 |
+
|
434 |
+
def set_tag_embeddings(self, num_image_tag_embeddings=0, num_context_tag_embeddings=0):
|
435 |
+
if num_image_tag_embeddings > 0:
|
436 |
+
self.config.num_image_tag_embeddings = num_image_tag_embeddings
|
437 |
+
self.image_tag_embeddings = zero_module(nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim))
|
438 |
+
if num_context_tag_embeddings > 0:
|
439 |
+
self.config.num_context_tag_embeddings = num_context_tag_embeddings
|
440 |
+
self.context_tag_embeddings = zero_module(nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim))
|
441 |
+
|
442 |
+
def set_mask_tokenizer(self, mask_in_chans, mask_out_chans, activation = nn.GELU):
|
443 |
+
self.mask_tokenizer = nn.Sequential(
|
444 |
+
nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
|
445 |
+
LayerNorm2d(mask_in_chans // 4),
|
446 |
+
activation(),
|
447 |
+
nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=3, padding=1),
|
448 |
+
LayerNorm2d(mask_in_chans),
|
449 |
+
activation(),
|
450 |
+
nn.Conv2d(mask_in_chans, mask_out_chans, kernel_size=1),
|
451 |
+
nn.AdaptiveAvgPool2d((16, 16))
|
452 |
+
)
|
453 |
+
|
454 |
+
self.mask_attn = CrossAttention(mask_out_chans, mask_out_chans)
|
455 |
+
|
456 |
+
def forward_mask_attn(self, mask_images, fg_images):
|
457 |
+
mask_images = self.mask_tokenizer(mask_images)
|
458 |
+
mask_images = mask_images.flatten(2).transpose(1, 2)
|
459 |
+
mask_images = self.mask_attn(mask_images, fg_images, attention_mask=None)
|
460 |
+
return mask_images
|
461 |
+
|
462 |
+
@property
|
463 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
|
464 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
465 |
+
r"""
|
466 |
+
Returns:
|
467 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model,
|
468 |
+
indexed by its weight name.
|
469 |
+
"""
|
470 |
+
# set recursively
|
471 |
+
processors = {}
|
472 |
+
|
473 |
+
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
474 |
+
if hasattr(module, "get_processor"):
|
475 |
+
processors[f"{name}.processor"] = module.get_processor()
|
476 |
+
|
477 |
+
for sub_name, child in module.named_children():
|
478 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
479 |
+
|
480 |
+
return processors
|
481 |
+
|
482 |
+
for name, module in self.named_children():
|
483 |
+
fn_recursive_add_processors(name, module, processors)
|
484 |
+
|
485 |
+
return processors
|
486 |
+
|
487 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
488 |
+
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
489 |
+
r"""
|
490 |
+
Sets the attention processor to use to compute attention.
|
491 |
+
|
492 |
+
Parameters:
|
493 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
494 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
495 |
+
for **all** `Attention` layers.
|
496 |
+
|
497 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
498 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
499 |
+
|
500 |
+
"""
|
501 |
+
count = len(self.attn_processors.keys())
|
502 |
+
|
503 |
+
if isinstance(processor, dict) and len(processor) != count:
|
504 |
+
raise ValueError(
|
505 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
506 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
507 |
+
)
|
508 |
+
|
509 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
510 |
+
if hasattr(module, "set_processor"):
|
511 |
+
if not isinstance(processor, dict):
|
512 |
+
module.set_processor(processor)
|
513 |
+
else:
|
514 |
+
module.set_processor(processor.pop(f"{name}.processor"))
|
515 |
+
|
516 |
+
for sub_name, child in module.named_children():
|
517 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
518 |
+
|
519 |
+
for name, module in self.named_children():
|
520 |
+
fn_recursive_attn_processor(name, module, processor)
|
521 |
+
|
522 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
|
523 |
+
def fuse_qkv_projections(self):
|
524 |
+
"""
|
525 |
+
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
526 |
+
are fused. For cross-attention modules, key and value projection matrices are fused.
|
527 |
+
|
528 |
+
<Tip warning={true}>
|
529 |
+
|
530 |
+
This API is 🧪 experimental.
|
531 |
+
|
532 |
+
</Tip>
|
533 |
+
"""
|
534 |
+
self.original_attn_processors = None
|
535 |
+
|
536 |
+
for _, attn_processor in self.attn_processors.items():
|
537 |
+
if "Added" in str(attn_processor.__class__.__name__):
|
538 |
+
raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
|
539 |
+
|
540 |
+
self.original_attn_processors = self.attn_processors
|
541 |
+
|
542 |
+
for module in self.modules():
|
543 |
+
if isinstance(module, Attention):
|
544 |
+
module.fuse_projections(fuse=True)
|
545 |
+
|
546 |
+
self.set_attn_processor(FusedFluxAttnProcessor2_0())
|
547 |
+
|
548 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
|
549 |
+
def unfuse_qkv_projections(self):
|
550 |
+
"""Disables the fused QKV projection if enabled.
|
551 |
+
|
552 |
+
<Tip warning={true}>
|
553 |
+
|
554 |
+
This API is 🧪 experimental.
|
555 |
+
|
556 |
+
</Tip>
|
557 |
+
|
558 |
+
"""
|
559 |
+
if self.original_attn_processors is not None:
|
560 |
+
self.set_attn_processor(self.original_attn_processors)
|
561 |
+
|
562 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
563 |
+
if hasattr(module, "gradient_checkpointing"):
|
564 |
+
module.gradient_checkpointing = value
|
565 |
+
|
566 |
+
def _format_input(self):
|
567 |
+
pass
|
568 |
+
|
569 |
+
def _format_output(self):
|
570 |
+
pass
|
571 |
+
|
572 |
+
def forward(
|
573 |
+
self,
|
574 |
+
hidden_states: torch.Tensor,
|
575 |
+
encoder_hidden_states: torch.Tensor = None,
|
576 |
+
cond_input: dict = None,
|
577 |
+
pooled_projections: torch.Tensor = None,
|
578 |
+
timestep: torch.LongTensor = None,
|
579 |
+
img_ids: torch.Tensor = None,
|
580 |
+
txt_ids: torch.Tensor = None,
|
581 |
+
guidance: torch.Tensor = None,
|
582 |
+
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
583 |
+
controlnet_block_samples=None,
|
584 |
+
controlnet_single_block_samples=None,
|
585 |
+
return_dict: bool = True,
|
586 |
+
controlnet_blocks_repeat: bool = False,
|
587 |
+
data_num_per_group: int = 1,
|
588 |
+
image_tags=None,
|
589 |
+
context_tags=None,
|
590 |
+
max_sequence_length: int = 512,
|
591 |
+
mix_attention_double=True,
|
592 |
+
mix_attention_single=True,
|
593 |
+
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
|
594 |
+
"""
|
595 |
+
The [`FluxTransformer2DModel`] forward method.
|
596 |
+
|
597 |
+
Args:
|
598 |
+
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
599 |
+
Input `hidden_states`.
|
600 |
+
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
601 |
+
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
602 |
+
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
603 |
+
from the embeddings of input conditions.
|
604 |
+
timestep ( `torch.LongTensor`):
|
605 |
+
Used to indicate denoising step.
|
606 |
+
block_controlnet_hidden_states: (`list` of `torch.Tensor`):
|
607 |
+
A list of tensors that if specified are added to the residuals of transformer blocks.
|
608 |
+
joint_attention_kwargs (`dict`, *optional*):
|
609 |
+
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
610 |
+
`self.processor` in
|
611 |
+
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
612 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
613 |
+
Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
|
614 |
+
tuple.
|
615 |
+
|
616 |
+
Returns:
|
617 |
+
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
618 |
+
`tuple` where the first element is the sample tensor.
|
619 |
+
"""
|
620 |
+
if joint_attention_kwargs is not None:
|
621 |
+
joint_attention_kwargs = joint_attention_kwargs.copy()
|
622 |
+
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
623 |
+
else:
|
624 |
+
lora_scale = 1.0
|
625 |
+
|
626 |
+
if USE_PEFT_BACKEND:
|
627 |
+
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
628 |
+
scale_lora_layers(self, lora_scale)
|
629 |
+
else:
|
630 |
+
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
631 |
+
logger.warning(
|
632 |
+
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
633 |
+
)
|
634 |
+
|
635 |
+
hidden_states = self.x_embedder(hidden_states)
|
636 |
+
|
637 |
+
mask_cond = None
|
638 |
+
mask_ids = None
|
639 |
+
if cond_input is not None:
|
640 |
+
cond_image_latents = cond_input["image_latents"]
|
641 |
+
cond_image_ids = cond_input["image_ids"]
|
642 |
+
cond_latents = self.x_embedder(cond_image_latents)
|
643 |
+
|
644 |
+
if joint_attention_kwargs is not None and "mask_cond" in joint_attention_kwargs:
|
645 |
+
mask_cond = joint_attention_kwargs.pop("mask_cond")
|
646 |
+
mask_ids = joint_attention_kwargs.pop("mask_ids")
|
647 |
+
if mask_cond is not None:
|
648 |
+
mask_cond = self.forward_mask_attn(mask_cond, cond_latents[:1])
|
649 |
+
# joint_attention_kwargs["mask_cond"] = mask_cond
|
650 |
+
# hidden_states = hidden_states + mask_cond
|
651 |
+
|
652 |
+
if image_tags is not None:
|
653 |
+
image_tag_embeddings = self.image_tag_embeddings(
|
654 |
+
torch.Tensor(
|
655 |
+
image_tags,
|
656 |
+
).to(device=hidden_states.device, dtype=torch.int64)
|
657 |
+
)
|
658 |
+
bsz = hidden_states.shape[0] // data_num_per_group
|
659 |
+
image_tag_embeddings = image_tag_embeddings.repeat_interleave(bsz, dim=0)
|
660 |
+
if cond_input is not None:
|
661 |
+
hidden_states = hidden_states + image_tag_embeddings[0]
|
662 |
+
cond_latents = cond_latents + image_tag_embeddings[1:].unsqueeze(1)
|
663 |
+
else:
|
664 |
+
# for debug
|
665 |
+
if len(hidden_states) != len(image_tag_embeddings):
|
666 |
+
hidden_states += image_tag_embeddings[:1].unsqueeze(1)
|
667 |
+
else:
|
668 |
+
hidden_states = hidden_states + image_tag_embeddings.unsqueeze(1)
|
669 |
+
|
670 |
+
timestep = timestep.to(hidden_states.dtype) * 1000
|
671 |
+
if guidance is not None:
|
672 |
+
guidance = guidance.to(hidden_states.dtype) * 1000
|
673 |
+
else:
|
674 |
+
guidance = None
|
675 |
+
|
676 |
+
temb = (
|
677 |
+
self.time_text_embed(timestep, pooled_projections)
|
678 |
+
if guidance is None
|
679 |
+
else self.time_text_embed(timestep, guidance, pooled_projections)
|
680 |
+
)
|
681 |
+
if cond_input is not None:
|
682 |
+
cond_time = 0
|
683 |
+
cond_temb = ( self.time_text_embed(torch.ones_like(timestep)*cond_time, pooled_projections)
|
684 |
+
if guidance is None
|
685 |
+
else self.time_text_embed(torch.ones_like(timestep)*cond_time, guidance, pooled_projections)
|
686 |
+
)
|
687 |
+
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
688 |
+
|
689 |
+
if context_tags is not None:
|
690 |
+
context_tag_embeddings = self.context_tag_embeddings(
|
691 |
+
torch.Tensor(
|
692 |
+
context_tags,
|
693 |
+
).to(device=hidden_states.device, dtype=torch.int64)
|
694 |
+
)
|
695 |
+
bsz = hidden_states.shape[0] // data_num_per_group
|
696 |
+
context_tag_embeddings = context_tag_embeddings.repeat_interleave(bsz, dim=0)
|
697 |
+
if cond_input is not None:
|
698 |
+
encoder_hidden_states = encoder_hidden_states + context_tag_embeddings[0]
|
699 |
+
else:
|
700 |
+
if len(encoder_hidden_states) != len(context_tag_embeddings):
|
701 |
+
encoder_hidden_states += context_tag_embeddings[:1].unsqueeze(1)
|
702 |
+
else:
|
703 |
+
encoder_hidden_states = encoder_hidden_states + context_tag_embeddings.unsqueeze(1)
|
704 |
+
|
705 |
+
if mask_cond is not None:
|
706 |
+
encoder_hidden_states = torch.cat([encoder_hidden_states, mask_cond], dim=1) # todo: compare with add
|
707 |
+
max_sequence_length = encoder_hidden_states.shape[1]
|
708 |
+
|
709 |
+
txt_ids = torch.cat((txt_ids, mask_ids), dim=0)
|
710 |
+
|
711 |
+
if isinstance(img_ids, list):
|
712 |
+
image_rotary_emb = []
|
713 |
+
for img_ids_ in img_ids:
|
714 |
+
ids = torch.cat((txt_ids, img_ids_), dim=0)
|
715 |
+
image_rotary_emb.append(self.pos_embed(ids))
|
716 |
+
image_rotary_emb = ( # to batch, cos / sin
|
717 |
+
torch.stack([_[0] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
|
718 |
+
torch.stack([_[1] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
|
719 |
+
)
|
720 |
+
else:
|
721 |
+
ids = torch.cat((txt_ids, img_ids), dim=0)
|
722 |
+
image_rotary_emb = self.pos_embed(ids)
|
723 |
+
if cond_input is not None:
|
724 |
+
cond_rotary_emb = []
|
725 |
+
for image_ids in cond_image_ids:
|
726 |
+
cond_rotary_emb.append(self.pos_embed(image_ids))
|
727 |
+
cond_rotary_emb = (
|
728 |
+
torch.stack([_[0] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
|
729 |
+
torch.stack([_[1] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
|
730 |
+
)
|
731 |
+
|
732 |
+
if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
|
733 |
+
ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
|
734 |
+
ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
|
735 |
+
joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
|
736 |
+
|
737 |
+
for index_block, block in enumerate(self.transformer_blocks):
|
738 |
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
739 |
+
|
740 |
+
def create_custom_forward(module, return_dict=None):
|
741 |
+
def custom_forward(*inputs):
|
742 |
+
if return_dict is not None:
|
743 |
+
return module(*inputs, return_dict=return_dict)
|
744 |
+
else:
|
745 |
+
return module(*inputs)
|
746 |
+
|
747 |
+
return custom_forward
|
748 |
+
|
749 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
750 |
+
# ckpt_kwargs.update(joint_attention_kwargs)
|
751 |
+
block_output = torch.utils.checkpoint.checkpoint(
|
752 |
+
create_custom_forward(block),
|
753 |
+
hidden_states,
|
754 |
+
encoder_hidden_states,
|
755 |
+
temb,
|
756 |
+
image_rotary_emb,
|
757 |
+
data_num_per_group,
|
758 |
+
max_sequence_length,
|
759 |
+
mix_attention_double,
|
760 |
+
cond_temb if cond_input is not None else None,
|
761 |
+
cond_rotary_emb if cond_input is not None else None,
|
762 |
+
cond_latents if cond_input is not None else None,
|
763 |
+
joint_attention_kwargs,
|
764 |
+
**ckpt_kwargs,
|
765 |
+
)
|
766 |
+
else:
|
767 |
+
block_output = block(
|
768 |
+
hidden_states=hidden_states,
|
769 |
+
encoder_hidden_states=encoder_hidden_states,
|
770 |
+
temb=temb,
|
771 |
+
image_rotary_emb=image_rotary_emb,
|
772 |
+
data_num_per_group=data_num_per_group,
|
773 |
+
max_sequence_length=max_sequence_length,
|
774 |
+
mix_attention=mix_attention_double,
|
775 |
+
cond_temb = cond_temb if cond_input is not None else None,
|
776 |
+
cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
|
777 |
+
cond_latents = cond_latents if cond_input is not None else None,
|
778 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
779 |
+
)
|
780 |
+
|
781 |
+
if cond_input is not None and mix_attention_double:
|
782 |
+
encoder_hidden_states, hidden_states, cond_latents = block_output
|
783 |
+
else:
|
784 |
+
encoder_hidden_states, hidden_states = block_output
|
785 |
+
|
786 |
+
# controlnet residual
|
787 |
+
if controlnet_block_samples is not None:
|
788 |
+
interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
|
789 |
+
interval_control = int(np.ceil(interval_control))
|
790 |
+
# For Xlabs ControlNet.
|
791 |
+
if controlnet_blocks_repeat:
|
792 |
+
hidden_states = (
|
793 |
+
hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
|
794 |
+
)
|
795 |
+
else:
|
796 |
+
hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
|
797 |
+
|
798 |
+
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
799 |
+
|
800 |
+
for index_block, block in enumerate(self.single_transformer_blocks):
|
801 |
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
802 |
+
|
803 |
+
def create_custom_forward(module, return_dict=None):
|
804 |
+
def custom_forward(*inputs):
|
805 |
+
if return_dict is not None:
|
806 |
+
return module(*inputs, return_dict=return_dict)
|
807 |
+
else:
|
808 |
+
return module(*inputs)
|
809 |
+
|
810 |
+
return custom_forward
|
811 |
+
|
812 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
813 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
814 |
+
create_custom_forward(block),
|
815 |
+
hidden_states,
|
816 |
+
temb,
|
817 |
+
image_rotary_emb,
|
818 |
+
data_num_per_group,
|
819 |
+
max_sequence_length,
|
820 |
+
mix_attention_single,
|
821 |
+
cond_temb if cond_input is not None else None,
|
822 |
+
cond_rotary_emb if cond_input is not None else None,
|
823 |
+
cond_latents if cond_input is not None else None,
|
824 |
+
joint_attention_kwargs,
|
825 |
+
**ckpt_kwargs,
|
826 |
+
)
|
827 |
+
|
828 |
+
else:
|
829 |
+
hidden_states = block(
|
830 |
+
hidden_states=hidden_states,
|
831 |
+
temb=temb,
|
832 |
+
image_rotary_emb=image_rotary_emb,
|
833 |
+
data_num_per_group=data_num_per_group,
|
834 |
+
max_sequence_length=max_sequence_length,
|
835 |
+
mix_attention=mix_attention_single,
|
836 |
+
cond_temb = cond_temb if cond_input is not None else None,
|
837 |
+
cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
|
838 |
+
cond_latents = cond_latents if cond_input is not None else None,
|
839 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
840 |
+
)
|
841 |
+
|
842 |
+
if cond_input is not None and mix_attention_single:
|
843 |
+
hidden_states, cond_latents = hidden_states
|
844 |
+
|
845 |
+
# controlnet residual
|
846 |
+
if controlnet_single_block_samples is not None:
|
847 |
+
interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
|
848 |
+
interval_control = int(np.ceil(interval_control))
|
849 |
+
hidden_states[:, encoder_hidden_states.shape[1]:, ...] = (
|
850 |
+
hidden_states[:, encoder_hidden_states.shape[1]:, ...]
|
851 |
+
+ controlnet_single_block_samples[index_block // interval_control]
|
852 |
+
)
|
853 |
+
|
854 |
+
hidden_states = hidden_states[:, encoder_hidden_states.shape[1]:, ...]
|
855 |
+
|
856 |
+
hidden_states = self.norm_out(hidden_states, temb)
|
857 |
+
output = self.proj_out(hidden_states)
|
858 |
+
|
859 |
+
if USE_PEFT_BACKEND:
|
860 |
+
# remove `lora_scale` from each PEFT layer
|
861 |
+
unscale_lora_layers(self, lora_scale)
|
862 |
+
|
863 |
+
if not return_dict:
|
864 |
+
return (output,)
|
865 |
+
|
866 |
+
return Transformer2DModelOutput(sample=output)
|
dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc
ADDED
Binary file (8.68 kB). View file
|
|
dreamfuse/trains/utils/inference_utils.py
ADDED
@@ -0,0 +1,386 @@
1 |
+
import torch
|
2 |
+
from diffusers.utils.torch_utils import randn_tensor
|
3 |
+
import numpy as np
|
4 |
+
from einops import rearrange
|
5 |
+
import torch.nn.functional as F
|
6 |
+
|
7 |
+
def get_mask_affine(mask1, mask2):
|
8 |
+
box1 = mask1.getbbox()
|
9 |
+
box2 = mask2.getbbox()
|
10 |
+
|
11 |
+
if box1 is None or box2 is None:
|
12 |
+
affine_coeffs = [1, 0, 0, 0, 1, 0]
|
13 |
+
return affine_coeffs
|
14 |
+
|
15 |
+
left1, top1, right1, bottom1 = box1
|
16 |
+
left2, top2, right2, bottom2 = box2
|
17 |
+
|
18 |
+
w1, h1 = right1 - left1, bottom1 - top1
|
19 |
+
w2, h2 = right2 - left2, bottom2 - top2
|
20 |
+
|
21 |
+
scale_x = w1 / w2
|
22 |
+
scale_y = h1 / h2
|
23 |
+
|
24 |
+
tx = left1 - left2*scale_x
|
25 |
+
ty = top1 - top2*scale_y
|
26 |
+
|
27 |
+
affine_coeffs = [scale_x, 0, tx, 0, scale_y, ty]
|
28 |
+
return affine_coeffs
|
29 |
+
|
30 |
+
def tokenize_prompt(tokenizer, prompt, max_sequence_length):
|
31 |
+
text_inputs = tokenizer(
|
32 |
+
prompt,
|
33 |
+
padding="max_length",
|
34 |
+
max_length=max_sequence_length,
|
35 |
+
truncation=True,
|
36 |
+
return_length=False,
|
37 |
+
return_overflowing_tokens=False,
|
38 |
+
return_tensors="pt",
|
39 |
+
)
|
40 |
+
text_input_ids = text_inputs.input_ids
|
41 |
+
return text_input_ids
|
42 |
+
|
43 |
+
|
44 |
+
def _encode_prompt_with_t5(
|
45 |
+
text_encoder,
|
46 |
+
tokenizer,
|
47 |
+
max_sequence_length=512,
|
48 |
+
prompt=None,
|
49 |
+
num_images_per_prompt=1,
|
50 |
+
device=None,
|
51 |
+
text_input_ids=None,
|
52 |
+
):
|
53 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
54 |
+
batch_size = len(prompt)
|
55 |
+
|
56 |
+
if tokenizer is not None:
|
57 |
+
text_inputs = tokenizer(
|
58 |
+
prompt,
|
59 |
+
padding="max_length",
|
60 |
+
max_length=max_sequence_length,
|
61 |
+
truncation=True,
|
62 |
+
return_length=False,
|
63 |
+
return_overflowing_tokens=False,
|
64 |
+
return_tensors="pt",
|
65 |
+
)
|
66 |
+
text_input_ids = text_inputs.input_ids
|
67 |
+
else:
|
68 |
+
if text_input_ids is None:
|
69 |
+
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
70 |
+
prompt_embeds = text_encoder(text_input_ids.to(device))[0]
|
71 |
+
|
72 |
+
dtype = text_encoder.dtype
|
73 |
+
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
74 |
+
|
75 |
+
_, seq_len, _ = prompt_embeds.shape
|
76 |
+
|
77 |
+
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
|
78 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
79 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
80 |
+
|
81 |
+
return prompt_embeds
|
82 |
+
|
83 |
+
|
84 |
+
def _encode_prompt_with_clip(
|
85 |
+
text_encoder,
|
86 |
+
tokenizer,
|
87 |
+
prompt: str,
|
88 |
+
device=None,
|
89 |
+
text_input_ids=None,
|
90 |
+
num_images_per_prompt: int = 1,
|
91 |
+
):
|
92 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
93 |
+
batch_size = len(prompt)
|
94 |
+
|
95 |
+
if tokenizer is not None:
|
96 |
+
text_inputs = tokenizer(
|
97 |
+
prompt,
|
98 |
+
padding="max_length",
|
99 |
+
max_length=77,
|
100 |
+
truncation=True,
|
101 |
+
return_overflowing_tokens=False,
|
102 |
+
return_length=False,
|
103 |
+
return_tensors="pt",
|
104 |
+
)
|
105 |
+
|
106 |
+
text_input_ids = text_inputs.input_ids
|
107 |
+
else:
|
108 |
+
if text_input_ids is None:
|
109 |
+
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
110 |
+
|
111 |
+
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
112 |
+
|
113 |
+
# Use pooled output of CLIPTextModel
|
114 |
+
prompt_embeds = prompt_embeds.pooler_output
|
115 |
+
prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
|
116 |
+
|
117 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
118 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
119 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
120 |
+
|
121 |
+
return prompt_embeds
|
122 |
+
|
123 |
+
|
124 |
+
def compute_text_embeddings(config, prompt, text_encoders, tokenizers, device):
|
125 |
+
with torch.no_grad():
|
126 |
+
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
|
127 |
+
text_encoders, tokenizers, prompt, config.max_sequence_length
|
128 |
+
)
|
129 |
+
prompt_embeds = prompt_embeds.to(device)
|
130 |
+
pooled_prompt_embeds = pooled_prompt_embeds.to(device)
|
131 |
+
text_ids = text_ids.to(device)
|
132 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
133 |
+
|
134 |
+
|
135 |
+
def _prepare_image_ids(height, width, offset_h=0, offset_w=0):
|
136 |
+
image_ids = torch.zeros(height, width, 3)
|
137 |
+
image_ids[..., 1] = image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
|
138 |
+
image_ids[..., 2] = image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
|
139 |
+
image_ids = image_ids.reshape(-1, 3)
|
140 |
+
return image_ids
|
141 |
+
|
142 |
+
|
143 |
+
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
|
144 |
+
latents = latents.view(
|
145 |
+
batch_size, num_channels_latents, height // 2, 2, width // 2, 2
|
146 |
+
)
|
147 |
+
latents = latents.permute(0, 2, 4, 1, 3, 5)
|
148 |
+
latents = latents.reshape(
|
149 |
+
batch_size, (height // 2) * (width // 2), num_channels_latents * 4
|
150 |
+
)
|
151 |
+
|
152 |
+
return latents
|
153 |
+
|
154 |
+
def _unpack_latents(latents, height, width, vae_downsample_factor):
|
155 |
+
batch_size, num_patches, channels = latents.shape
|
156 |
+
|
157 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
158 |
+
# latent height and width to be divisible by 2.
|
159 |
+
height = 2 * (int(height) // (vae_downsample_factor * 2))
|
160 |
+
width = 2 * (int(width) // (vae_downsample_factor * 2))
|
161 |
+
|
162 |
+
latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
|
163 |
+
latents = latents.permute(0, 3, 1, 4, 2, 5)
|
164 |
+
|
165 |
+
latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
|
166 |
+
|
167 |
+
return latents
|
168 |
+
|
169 |
+
|
170 |
+
def _prepare_latent_image_ids(batch_size, height, width, device, dtype, offset_h=0, offset_w=0):
|
171 |
+
latent_image_ids = torch.zeros(height, width, 3)
|
172 |
+
latent_image_ids[..., 1] = (
|
173 |
+
latent_image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
|
174 |
+
)
|
175 |
+
latent_image_ids[..., 2] = (
|
176 |
+
latent_image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
|
177 |
+
)
|
178 |
+
|
179 |
+
latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
|
180 |
+
latent_image_ids.shape
|
181 |
+
)
|
182 |
+
|
183 |
+
latent_image_ids = latent_image_ids.reshape(
|
184 |
+
latent_image_id_height * latent_image_id_width, latent_image_id_channels
|
185 |
+
)
|
186 |
+
|
187 |
+
return latent_image_ids.to(device=device, dtype=dtype)
|
188 |
+
|
189 |
+
|
190 |
+
def pil_to_tensor(image, device="cpu"):
|
191 |
+
image = np.array(image)
|
192 |
+
image = torch.from_numpy(image).float() / 127.5 - 1.0
|
193 |
+
image = image.permute(2, 0, 1).to(device)
|
194 |
+
return image
|
195 |
+
|
196 |
+
@torch.no_grad()
|
197 |
+
def encode_images_cond(vae_model, condition_images, device):
|
198 |
+
condition_image_tensors = []
|
199 |
+
for condition_image in condition_images:
|
200 |
+
condition_image_tensor = torch.tensor(np.array(condition_image)).to(device).permute(0, 3, 1, 2) # shape: [n_cond, c, h, w]
|
201 |
+
condition_image_tensor = condition_image_tensor / 127.5 - 1.0
|
202 |
+
condition_image_tensors.append(condition_image_tensor)
|
203 |
+
condition_image_tensors = torch.stack(condition_image_tensors) # shape: [bs, n_cond, c, h, w]
|
204 |
+
condition_image_tensors = rearrange(condition_image_tensors, 'b n c h w -> (b n) c h w')
|
205 |
+
|
206 |
+
# encode condition images
|
207 |
+
condition_image_latents = (
|
208 |
+
vae_model.encode(
|
209 |
+
condition_image_tensors.to(vae_model.dtype)
|
210 |
+
).latent_dist.sample()
|
211 |
+
) # shape: [bs*n_cond, c, h // 8, w // 8]
|
212 |
+
condition_image_latents = (condition_image_latents - vae_model.config.shift_factor) * vae_model.config.scaling_factor
|
213 |
+
|
214 |
+
return condition_image_latents
|
215 |
+
|
216 |
+
|
217 |
+
def prepare_latents(
|
218 |
+
batch_size,
|
219 |
+
num_channels_latents,
|
220 |
+
vae_downsample_factor,
|
221 |
+
height,
|
222 |
+
width,
|
223 |
+
dtype,
|
224 |
+
device,
|
225 |
+
generator,
|
226 |
+
latents=None,
|
227 |
+
offset=None,
|
228 |
+
hw=False,
|
229 |
+
):
|
230 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
231 |
+
# latent height and width to be divisible by 2.
|
232 |
+
height = 2 * (int(height) // (vae_downsample_factor * 2))
|
233 |
+
width = 2 * (int(width) // (vae_downsample_factor * 2))
|
234 |
+
|
235 |
+
shape = (batch_size, num_channels_latents, height, width)
|
236 |
+
|
237 |
+
if latents is not None:
|
238 |
+
if offset is None:
|
239 |
+
latent_image_ids = _prepare_latent_image_ids(
|
240 |
+
batch_size, height // 2, width // 2, device, dtype
|
241 |
+
)
|
242 |
+
else:
|
243 |
+
latent_image_ids = []
|
244 |
+
for offset_ in offset:
|
245 |
+
latent_image_ids.append(
|
246 |
+
_prepare_latent_image_ids(
|
247 |
+
batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
|
248 |
+
)
|
249 |
+
)
|
250 |
+
return latents.to(device=device, dtype=dtype), latent_image_ids
|
251 |
+
|
252 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
253 |
+
raise ValueError(
|
254 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
255 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
256 |
+
)
|
257 |
+
|
258 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
259 |
+
latents = _pack_latents(
|
260 |
+
latents, batch_size, num_channels_latents, height, width
|
261 |
+
)
|
262 |
+
if offset is None:
|
263 |
+
latent_image_ids = _prepare_latent_image_ids(
|
264 |
+
batch_size, height // 2, width // 2, device, dtype
|
265 |
+
)
|
266 |
+
else:
|
267 |
+
latent_image_ids = []
|
268 |
+
for offset_ in offset:
|
269 |
+
latent_image_ids.append(
|
270 |
+
_prepare_latent_image_ids(
|
271 |
+
batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
|
272 |
+
)
|
273 |
+
)
|
274 |
+
return latents, latent_image_ids
|
275 |
+
|
276 |
+
|
277 |
+
@torch.no_grad()
|
278 |
+
def encode_prompt(
|
279 |
+
text_encoders,
|
280 |
+
tokenizers,
|
281 |
+
prompt: str,
|
282 |
+
max_sequence_length,
|
283 |
+
device=None,
|
284 |
+
num_images_per_prompt: int = 1,
|
285 |
+
text_input_ids_list=None,
|
286 |
+
):
|
287 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
288 |
+
dtype = text_encoders[0].dtype
|
289 |
+
|
290 |
+
pooled_prompt_embeds = _encode_prompt_with_clip(
|
291 |
+
text_encoder=text_encoders[0],
|
292 |
+
tokenizer=tokenizers[0],
|
293 |
+
prompt=prompt,
|
294 |
+
device=device if device is not None else text_encoders[0].device,
|
295 |
+
num_images_per_prompt=num_images_per_prompt,
|
296 |
+
text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
|
297 |
+
)
|
298 |
+
|
299 |
+
prompt_embeds = _encode_prompt_with_t5(
|
300 |
+
text_encoder=text_encoders[1],
|
301 |
+
tokenizer=tokenizers[1],
|
302 |
+
max_sequence_length=max_sequence_length,
|
303 |
+
prompt=prompt,
|
304 |
+
num_images_per_prompt=num_images_per_prompt,
|
305 |
+
device=device if device is not None else text_encoders[1].device,
|
306 |
+
text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
|
307 |
+
)
|
308 |
+
|
309 |
+
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
310 |
+
|
311 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
312 |
+
|
313 |
+
def warp_affine_tensor(input_tensor, mask_affines, output_size, scale_factor=1/16,
|
314 |
+
align_corners_grid=False, align_corners_sample=True,
|
315 |
+
flatten_output=True, device=None):
|
316 |
+
"""
|
317 |
+
Applies an affine transformation to the input tensor and returns the warped result.
|
318 |
+
|
319 |
+
Args:
|
320 |
+
input_tensor: image tensor to warp; supported shapes are (H, W, C), (C, H, W) or (1, C, H, W).
|
321 |
+
mask_affines: affine parameters (e.g. [a, 0, t_x, 0, e, t_y]); their units assume a 512×512 image.
|
322 |
+
output_size: target spatial size, given as (H_out, W_out).
|
323 |
+
scale_factor: scaling factor for the translation terms; e.g. for 512→32, factor = 32/512 = 1/16.
|
324 |
+
align_corners_grid: align_corners flag passed to F.affine_grid.
|
325 |
+
align_corners_sample: align_corners flag passed to F.grid_sample.
|
326 |
+
flatten_output: if True, reshape the warped output from (1, C, H_out, W_out) to (-1, C).
|
327 |
+
device: if set, move the relevant tensors to this device.
|
328 |
+
|
329 |
+
Returns:
|
330 |
+
warped_output: the affine-warped tensor;
|
331 |
+
shape (H_out*W_out, C) if flatten_output is True, otherwise (1, C, H_out, W_out).
|
332 |
+
"""
|
333 |
+
# If the input tensor is not batched (4D), reshape it to (1, C, H, W)
|
334 |
+
if input_tensor.dim() == 3:
|
335 |
+
# If the last dimension is 3, treat the layout as (H, W, C), i.e. RGB
|
336 |
+
if input_tensor.shape[-1] == 3:
|
337 |
+
input_tensor = input_tensor.permute(2, 0, 1)
|
338 |
+
input_tensor = input_tensor.unsqueeze(0)
|
339 |
+
elif input_tensor.dim() != 4:
|
340 |
+
raise ValueError("input_tensor 必须是 3D 或 4D Tensor!")
|
341 |
+
|
342 |
+
# 输出尺寸
|
343 |
+
H_out, W_out = output_size
|
344 |
+
B, C, H_in, W_in = input_tensor.shape
|
345 |
+
|
346 |
+
# Convert mask_affines to a tensor of shape (1, 6)
|
347 |
+
if not torch.is_tensor(mask_affines):
|
348 |
+
theta = torch.tensor(mask_affines, dtype=torch.float32).unsqueeze(0)
|
349 |
+
else:
|
350 |
+
theta = mask_affines.clone().float()
|
351 |
+
if theta.dim() == 1:
|
352 |
+
theta = theta.unsqueeze(0)
|
353 |
+
|
354 |
+
# 调整平移部分(第三和第六个元素),使其适应当前目标分辨率
|
355 |
+
theta[0, 2] *= scale_factor # x 方向平移
|
356 |
+
theta[0, 5] *= scale_factor # y 方向平移
|
357 |
+
|
358 |
+
a = theta[0, 0]
|
359 |
+
t_x = theta[0, 2]
|
360 |
+
e = theta[0, 4]
|
361 |
+
t_y = theta[0, 5]
|
362 |
+
|
363 |
+
# 根据归一化转换(范围 [-1, 1])
|
364 |
+
# 对 x 方向:归一化公式为 x_norm = 2*x/(W_out-1) - 1
|
365 |
+
# 转换后 affine 的常数项即为:a + 2*t_x/(W_out-1) - 1
|
366 |
+
theta_norm = torch.tensor([
|
367 |
+
[a, 0.0, a + 2*t_x/(W_out - 1) - 1],
|
368 |
+
[0.0, e, e + 2*t_y/(H_out - 1) - 1]
|
369 |
+
], dtype=torch.float32).unsqueeze(0)
|
370 |
+
|
371 |
+
# 根据目标输出大小创建 affine_grid,grid 的 size 为 (B, C, H_out, W_out)
|
372 |
+
grid = F.affine_grid(theta_norm, size=(B, C, H_out, W_out), align_corners=align_corners_grid)
|
373 |
+
if device is not None:
|
374 |
+
grid = grid.to(device)
|
375 |
+
input_tensor = input_tensor.to(device)
|
376 |
+
|
377 |
+
# 对输入 tensor 进行采样
|
378 |
+
warped = F.grid_sample(input_tensor, grid, align_corners=align_corners_sample)
|
379 |
+
|
380 |
+
# 若需要将输出展平为 (-1, C)
|
381 |
+
if flatten_output:
|
382 |
+
# 将 (1, C, H_out, W_out) → 转为 (H_out, W_out, C) → reshape(-1, C)
|
383 |
+
warped = warped.squeeze(0).permute(1, 2, 0).reshape(-1, C)
|
384 |
+
return warped
|
385 |
+
|
386 |
+
|
dreamfuse_inference.py
ADDED
@@ -0,0 +1,642 @@
1 |
+
import gc
|
2 |
+
import os
|
3 |
+
from typing import List
|
4 |
+
import contextlib
|
5 |
+
import torch.multiprocessing as mp
|
6 |
+
from dataclasses import dataclass, field
|
7 |
+
from collections import defaultdict
|
8 |
+
import random
|
9 |
+
import numpy as np
|
10 |
+
from PIL import Image, ImageOps
|
11 |
+
import json
|
12 |
+
import torch
|
13 |
+
from peft import PeftModel
|
14 |
+
import torch.nn.functional as F
|
15 |
+
import accelerate
|
16 |
+
import diffusers
|
17 |
+
from diffusers import FluxPipeline
|
18 |
+
from diffusers.utils.torch_utils import is_compiled_module
|
19 |
+
import transformers
|
20 |
+
from tqdm import tqdm
|
21 |
+
from peft import LoraConfig, set_peft_model_state_dict
|
22 |
+
from peft.utils import get_peft_model_state_dict
|
23 |
+
from dreamfuse.models.dreamfuse_flux.transformer import (
|
24 |
+
FluxTransformer2DModel,
|
25 |
+
FluxTransformerBlock,
|
26 |
+
FluxSingleTransformerBlock,
|
27 |
+
)
|
28 |
+
from diffusers.schedulers.scheduling_flow_match_euler_discrete import (
|
29 |
+
FlowMatchEulerDiscreteScheduler,
|
30 |
+
)
|
31 |
+
from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps
|
32 |
+
from dreamfuse.trains.utils.inference_utils import (
|
33 |
+
compute_text_embeddings,
|
34 |
+
prepare_latents,
|
35 |
+
_unpack_latents,
|
36 |
+
_pack_latents,
|
37 |
+
_prepare_image_ids,
|
38 |
+
encode_images_cond,
|
39 |
+
get_mask_affine,
|
40 |
+
warp_affine_tensor
|
41 |
+
)
|
42 |
+
|
43 |
+
|
44 |
+
def seed_everything(seed):
|
45 |
+
torch.manual_seed(seed)
|
46 |
+
torch.cuda.manual_seed(seed)
|
47 |
+
random.seed(seed)
|
48 |
+
np.random.seed(seed)
|
49 |
+
|
50 |
+
@dataclass
|
51 |
+
class InferenceConfig:
|
52 |
+
# Model paths
|
53 |
+
flux_model_id: str = 'black-forest-labs/FLUX.1-dev'
|
54 |
+
|
55 |
+
lora_id: str = ''
|
56 |
+
model_choice: str = 'dev'
|
57 |
+
# Model configs
|
58 |
+
lora_rank: int = 16
|
59 |
+
max_sequence_length: int = 256
|
60 |
+
guidance_scale: float = 3.5
|
61 |
+
num_inference_steps: int = 28
|
62 |
+
mask_ids: int = 16
|
63 |
+
mask_in_chans: int = 128
|
64 |
+
mask_out_chans: int = 3072
|
65 |
+
inference_scale = 1024
|
66 |
+
|
67 |
+
# Training configs
|
68 |
+
gradient_checkpointing: bool = False
|
69 |
+
mix_attention_double: bool = True
|
70 |
+
mix_attention_single: bool = True
|
71 |
+
|
72 |
+
# Image processing
|
73 |
+
image_ids_offset: List[int] = field(default_factory=lambda: [0, 0, 0])
|
74 |
+
image_tags: List[int] = field(default_factory=lambda: [0, 1, 2])
|
75 |
+
context_tags: List[int] = None
|
76 |
+
|
77 |
+
# Runtime configs
|
78 |
+
device: str = "cuda:0" # if torch.cuda.is_available() else "cpu"
|
79 |
+
dtype: torch.dtype = torch.bfloat16
|
80 |
+
seed: int = 1234
|
81 |
+
debug: bool = True
|
82 |
+
|
83 |
+
# I/O configs
|
84 |
+
valid_output_dir: str = "./inference_output"
|
85 |
+
valid_roots: List[str] = field(default_factory=lambda: [
|
86 |
+
"./",
|
87 |
+
])
|
88 |
+
valid_jsons: List[str] = field(default_factory=lambda: [
|
89 |
+
"./examples/data_dreamfuse.json",
|
90 |
+
])
|
91 |
+
ref_prompts: str = ""
|
92 |
+
|
93 |
+
truecfg: bool = False
|
94 |
+
text_strength: int = 5
|
95 |
+
|
96 |
+
# multi gpu
|
97 |
+
sub_idx:int = 0
|
98 |
+
total_num:int = 1
|
99 |
+
|
100 |
+
def adjust_fg_to_bg(image: Image.Image, mask: Image.Image, target_size: tuple) -> tuple[Image.Image, Image.Image]:
|
101 |
+
width, height = image.size
|
102 |
+
target_w, target_h = target_size
|
103 |
+
|
104 |
+
scale = min(target_w / width, target_h / height)
|
105 |
+
if scale < 1:
|
106 |
+
new_w = int(width * scale)
|
107 |
+
new_h = int(height * scale)
|
108 |
+
image = image.resize((new_w, new_h))
|
109 |
+
mask = mask.resize((new_w, new_h))
|
110 |
+
width, height = new_w, new_h
|
111 |
+
|
112 |
+
pad_w = target_w - width
|
113 |
+
pad_h = target_h - height
|
114 |
+
padding = (
|
115 |
+
pad_w // 2, # left
|
116 |
+
pad_h // 2, # top
|
117 |
+
(pad_w + 1) // 2, # right
|
118 |
+
(pad_h + 1) // 2 # bottom
|
119 |
+
)
|
120 |
+
|
121 |
+
image = ImageOps.expand(image, border=padding, fill=(255, 255, 255))
|
122 |
+
mask = ImageOps.expand(mask, border=padding, fill=0)
|
123 |
+
|
124 |
+
return image, mask
|
125 |
+
|
126 |
+
def find_nearest_bucket_size(input_width, input_height, mode="x64", bucket_size=1024):
|
127 |
+
"""
|
128 |
+
Finds the nearest bucket size for the given input size.
|
129 |
+
"""
|
130 |
+
buckets = {
|
131 |
+
512: [[ 256, 768 ], [ 320, 768 ], [ 320, 704 ], [ 384, 640 ], [ 448, 576 ], [ 512, 512 ], [ 576, 448 ], [ 640, 384 ], [ 704, 320 ], [ 768, 320 ], [ 768, 256 ]],
|
132 |
+
768: [[ 384, 1152 ], [ 480, 1152 ], [ 480, 1056 ], [ 576, 960 ], [ 672, 864 ], [ 768, 768 ], [ 864, 672 ], [ 960, 576 ], [ 1056, 480 ], [ 1152, 480 ], [ 1152, 384 ]],
|
133 |
+
1024: [[ 512, 1536 ], [ 640, 1536 ], [ 640, 1408 ], [ 768, 1280 ], [ 896, 1152 ], [ 1024, 1024 ], [ 1152, 896 ], [ 1280, 768 ], [ 1408, 640 ], [ 1536, 640 ], [ 1536, 512 ]]
|
134 |
+
}
|
135 |
+
|
136 |
+
buckets = buckets[bucket_size]
|
137 |
+
|
138 |
+
aspect_ratios = [w / h for (w, h) in buckets]
|
139 |
+
assert mode in ["x64", "x8"]
|
140 |
+
if mode == "x64":
|
141 |
+
asp = input_width / input_height
|
142 |
+
diff = [abs(ar - asp) for ar in aspect_ratios]
|
143 |
+
bucket_id = int(np.argmin(diff))
|
144 |
+
gen_width, gen_height = buckets[bucket_id]
|
145 |
+
elif mode == "x8":
|
146 |
+
max_pixels = 1024 * 1024
|
147 |
+
ratio = (max_pixels / (input_width * input_height)) ** (0.5)
|
148 |
+
gen_width, gen_height = round(input_width * ratio), round(input_height * ratio)
|
149 |
+
gen_width = gen_width - gen_width % 8
|
150 |
+
gen_height = gen_height - gen_height % 8
|
151 |
+
else:
|
152 |
+
raise NotImplementedError
|
153 |
+
return (gen_width, gen_height)
|
154 |
+
|
155 |
+
def make_image_grid(images, rows, cols, size=None):
|
156 |
+
assert len(images) == rows * cols
|
157 |
+
|
158 |
+
if size is not None:
|
159 |
+
images = [img.resize((size[0], size[1])) for img in images]
|
160 |
+
|
161 |
+
w, h = images[0].size
|
162 |
+
grid = Image.new("RGB", size=(cols * w, rows * h))
|
163 |
+
|
164 |
+
for i, img in enumerate(images):
|
165 |
+
grid.paste(img.convert("RGB"), box=(i % cols * w, i // cols * h))
|
166 |
+
return grid
|
167 |
+
|
168 |
+
class DreamFuseInference:
|
169 |
+
def __init__(self, config: InferenceConfig):
|
170 |
+
self.config = config
|
171 |
+
print(config.device)
|
172 |
+
self.device = torch.device(config.device)
|
173 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
174 |
+
seed_everything(config.seed)
|
175 |
+
self._init_models()
|
176 |
+
|
177 |
+
def _init_models(self):
|
178 |
+
# Initialize tokenizers
|
179 |
+
self.tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
|
180 |
+
self.config.flux_model_id, subfolder="tokenizer"
|
181 |
+
)
|
182 |
+
self.tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
|
183 |
+
self.config.flux_model_id, subfolder="tokenizer_2"
|
184 |
+
)
|
185 |
+
|
186 |
+
# Initialize text encoders
|
187 |
+
self.text_encoder_one = transformers.CLIPTextModel.from_pretrained(
|
188 |
+
self.config.flux_model_id, subfolder="text_encoder"
|
189 |
+
).to(device=self.device, dtype=self.config.dtype)
|
190 |
+
self.text_encoder_two = transformers.T5EncoderModel.from_pretrained(
|
191 |
+
self.config.flux_model_id, subfolder="text_encoder_2"
|
192 |
+
).to(device=self.device, dtype=self.config.dtype)
|
193 |
+
|
194 |
+
# Initialize VAE
|
195 |
+
self.vae = diffusers.AutoencoderKL.from_pretrained(
|
196 |
+
self.config.flux_model_id, subfolder="vae"
|
197 |
+
).to(device=self.device, dtype=self.config.dtype)
|
198 |
+
|
199 |
+
# Initialize denoising model
|
200 |
+
self.denoise_model = FluxTransformer2DModel.from_pretrained(
|
201 |
+
self.config.flux_model_id, subfolder="transformer"
|
202 |
+
).to(device=self.device, dtype=self.config.dtype)
|
203 |
+
|
204 |
+
if self.config.image_tags is not None or self.config.context_tags is not None:
|
205 |
+
num_image_tag_embeddings = max(self.config.image_tags) + 1 if self.config.image_tags is not None else 0
|
206 |
+
num_context_tag_embeddings = max(self.config.context_tags) + 1 if self.config.context_tags is not None else 0
|
207 |
+
self.denoise_model.set_tag_embeddings(
|
208 |
+
num_image_tag_embeddings=num_image_tag_embeddings,
|
209 |
+
num_context_tag_embeddings=num_context_tag_embeddings,
|
210 |
+
)
|
211 |
+
|
212 |
+
# Add LoRA
|
213 |
+
self.denoise_model = PeftModel.from_pretrained(
|
214 |
+
self.denoise_model,
|
215 |
+
self.config.lora_id,
|
216 |
+
adapter_weights=[1.0],
|
217 |
+
device_map={"": self.device}
|
218 |
+
)
|
219 |
+
|
220 |
+
# Initialize scheduler
|
221 |
+
self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
|
222 |
+
self.config.flux_model_id, subfolder="scheduler"
|
223 |
+
)
|
224 |
+
|
225 |
+
# Set models to eval mode
|
226 |
+
for model in [self.text_encoder_one, self.text_encoder_two, self.vae, self.denoise_model]:
|
227 |
+
model.eval()
|
228 |
+
model.requires_grad_(False)
|
229 |
+
|
230 |
+
def _compute_text_embeddings(self, prompt):
|
231 |
+
return compute_text_embeddings(
|
232 |
+
self.config,
|
233 |
+
prompt,
|
234 |
+
[self.text_encoder_one, self.text_encoder_two],
|
235 |
+
[self.tokenizer_one, self.tokenizer_two],
|
236 |
+
self.device
|
237 |
+
)
|
238 |
+
|
239 |
+
def resize_to_fit_within(self, reference_image, target_image):
|
240 |
+
ref_width, ref_height = reference_image.size
|
241 |
+
target_width, target_height = target_image.size
|
242 |
+
|
243 |
+
scale_width = ref_width / target_width
|
244 |
+
scale_height = ref_height / target_height
|
245 |
+
scale = min(scale_width, scale_height)  # use the smaller ratio so the result stays within the reference image
|
246 |
+
|
247 |
+
new_width = int(target_width * scale)
|
248 |
+
new_height = int(target_height * scale)
|
249 |
+
|
250 |
+
resized_image = target_image.resize((new_width, new_height), Image.LANCZOS)
|
251 |
+
return resized_image
|
252 |
+
|
253 |
+
def pad_or_crop(self, img, target_size, fill_color=(255, 255, 255)):
|
254 |
+
"""
|
255 |
+
Center-aligns the input image and crops or pads it to target_size.
|
256 |
+
|
257 |
+
Args:
|
258 |
+
img - a PIL.Image object
|
259 |
+
target_size - target size (width, height)
|
260 |
+
fill_color - fill colour, white by default
|
261 |
+
|
262 |
+
Returns:
|
263 |
+
The adjusted PIL.Image object, of size target_size
|
264 |
+
"""
|
265 |
+
iw, ih = img.size
|
266 |
+
tw, th = target_size
|
267 |
+
|
268 |
+
# Compute the crop region: if the image is larger than the target, crop the centre; otherwise keep it all
|
269 |
+
left = (iw - tw) // 2 if iw >= tw else 0
|
270 |
+
top = (ih - th) // 2 if ih >= th else 0
|
271 |
+
cropped = img.crop((left, top, left + min(iw, tw), top + min(ih, th)))
|
272 |
+
|
273 |
+
# Create a new image of the target size and paste the cropped image centred
|
274 |
+
new_img = Image.new(img.mode, target_size, fill_color)
|
275 |
+
offset = ((tw - cropped.width) // 2, (th - cropped.height) // 2)
|
276 |
+
new_img.paste(cropped, offset)
|
277 |
+
|
278 |
+
return new_img
|
279 |
+
|
280 |
+
def transform_foreground_original(self, original_fg, original_bg, transformation_info, canvas_size=400):
|
281 |
+
"""
|
282 |
+
Translates the original foreground image (original_fg) according to transformation_info.
|
283 |
+
Requirements:
|
284 |
+
1. The output image has the same size as original_fg (the original foreground size is preserved);
|
285 |
+
2. for the offset computation, the drag coordinates are converted back to the unscaled space, i.e. drag_left/drag_top divided by scale_ratio;
|
286 |
+
3. the relative drag offset is measured on the 400x400 preview canvas against the default (centred) unscaled position,
|
287 |
+
and is then rescaled to the actual offset (in pixels) at the original foreground resolution.
|
288 |
+
4. The result is pasted onto a canvas of the original foreground size (uncovered areas are filled with white).
|
289 |
+
|
290 |
+
Args:
|
291 |
+
original_fg: the originally uploaded foreground image (PIL Image)
|
292 |
+
transformation_info: dict that must contain the following fields:
|
293 |
+
- "drag_left": x coordinate of the displayed foreground's top-left corner after dragging (affected by scaling, in pixels)
|
294 |
+
- "drag_top": y coordinate of the displayed foreground's top-left corner after dragging (affected by scaling, in pixels)
|
295 |
+
- "scale_ratio": scale ratio of the foreground in the preview
|
296 |
+
- "data_original_width": width of the foreground in the preview before scaling
|
297 |
+
- "data_original_height": height of the foreground in the preview before scaling
|
298 |
+
canvas_size: preview canvas size (400 by default, matching the front end)
|
299 |
+
|
300 |
+
Returns:
|
301 |
+
The processed image (PIL Image), the same size as original_fg,
|
302 |
+
translated according to the relative drag offset in the unscaled space.
|
303 |
+
"""
|
304 |
+
# 读取 transformation_info 中的参数
|
305 |
+
drag_left = float(transformation_info.get("drag_left", 0))
|
306 |
+
drag_top = float(transformation_info.get("drag_top", 0))
|
307 |
+
scale_ratio = float(transformation_info.get("scale_ratio", 1))
|
308 |
+
data_orig_width = float(transformation_info.get("data_original_width", canvas_size))
|
309 |
+
data_orig_height = float(transformation_info.get("data_original_height", canvas_size))
|
310 |
+
drag_width = float(transformation_info.get("drag_width", 0))
|
311 |
+
drag_height = float(transformation_info.get("drag_height", 0))
|
312 |
+
|
313 |
+
|
314 |
+
scale_ori_fg = canvas_size / max(original_fg.width, original_fg.height)
|
315 |
+
scale_ori_bg = canvas_size / max(original_bg.width, original_bg.height)
|
316 |
+
|
317 |
+
# 计算未缩放状态下(预览中)的默认居中位置(前景图未拖拽时的理想位置)
|
318 |
+
default_left = (canvas_size - data_orig_width) / 2.0
|
319 |
+
default_top = (canvas_size - data_orig_height) / 2.0
|
320 |
+
|
321 |
+
# 在未缩放状态下,计算实际拖拽产生的偏移(单位:像素,在预览尺寸下计算)
|
322 |
+
offset_preview_x = drag_left - default_left
|
323 |
+
offset_preview_y = drag_top - default_top
|
324 |
+
|
325 |
+
offset_ori_x = offset_preview_x / scale_ori_fg
|
326 |
+
offset_ori_y = offset_preview_y / scale_ori_fg
|
327 |
+
|
328 |
+
new_width = int(original_fg.width * scale_ratio)
|
329 |
+
new_height = int(original_fg.height * scale_ratio)
|
330 |
+
scale_fg = original_fg.resize((new_width, new_height))
|
331 |
+
|
332 |
+
output = Image.new("RGBA", (original_fg.width, original_fg.height), (255, 255, 255, 0))
|
333 |
+
output.paste(scale_fg, (int(offset_ori_x), int(offset_ori_y)))
|
334 |
+
|
335 |
+
new_width_fgbg = original_fg.width * scale_ori_fg / scale_ori_bg
|
336 |
+
new_height_fgbg = original_fg.height * scale_ori_fg / scale_ori_bg
|
337 |
+
scale_fgbg = output.resize((int(new_width_fgbg), int(new_height_fgbg)))
|
338 |
+
|
339 |
+
|
340 |
+
final_output = Image.new("RGBA", (original_bg.width, original_bg.height), (255, 255, 255, 0))
|
341 |
+
scale_fgbg = self.pad_or_crop(scale_fgbg, (original_bg.width, original_bg.height), (255, 255, 255, 0))
|
342 |
+
final_output.paste(scale_fgbg, (0, 0))
|
343 |
+
|
344 |
+
fit_fg = self.resize_to_fit_within(original_bg, original_fg)
|
345 |
+
fit_fg = self.pad_or_crop(fit_fg, original_bg.size, (255, 255, 255, 0))
|
346 |
+
|
347 |
+
return final_output, fit_fg
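    # Illustrative note (not in the original file): the preview-to-original mapping is a
    # division by the preview scale. With a 2000x1000 foreground, canvas_size=400 gives
    # scale_ori_fg = 400 / 2000 = 0.2, so a drag offset of 40 preview pixels becomes
    # 40 / 0.2 = 200 pixels at the original foreground resolution.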

    @torch.inference_mode()
    def gradio_generate(self, background_img, foreground_img, transformation_info, seed, prompt, enable_gui, cfg=3.5, size_select="1024", text_strength=1, truecfg=False):
        """Run DreamFuse model inference for the Gradio demo."""
        print("!" * 10)
        try:
            trans = json.loads(transformation_info)
        except Exception:
            trans = {}

        size_select = int(size_select)

        # import pdb; pdb.set_trace()
        r, g, b, ori_a = foreground_img.split()
        fg_img_scale, fg_img = self.transform_foreground_original(foreground_img, background_img, trans)

        new_r, new_g, new_b, new_a = fg_img_scale.split()
        foreground_img_scale = Image.merge("RGB", (new_r, new_g, new_b))

        r, g, b, ori_a = fg_img.split()
        foreground_img = Image.merge("RGB", (r, g, b))
        foreground_img_save = foreground_img.copy()
        ori_a = ori_a.convert("L")
        new_a = new_a.convert("L")
        # White out everything outside the foreground's alpha mask
        foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_a))
        print("0" * 10)
        print(foreground_img.size)
        print(background_img.size)
        images = self.model_generate(foreground_img.copy(), background_img.copy(),
                                     ori_a, new_a,
                                     enable_mask_affine=enable_gui,
                                     prompt=prompt,
                                     offset_cond=[0, 1, 0] if not enable_gui else None,
                                     seed=seed,
                                     cfg=cfg,
                                     size_select=size_select,
                                     text_strength=text_strength,
                                     truecfg=truecfg)
        images = Image.fromarray(images[0], "RGB")

        images = images.resize(background_img.size)
        images_save = images.copy()

        images.thumbnail((640, 640), Image.LANCZOS)
        return images
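    # Illustrative note (not in the original file): transformation_info arrives as a JSON
    # string produced by the drag-and-drop front end. Based on the keys read above, a
    # hypothetical payload could look like:
    #   {"drag_left": 120, "drag_top": 80, "scale_ratio": 0.75,
    #    "data_original_width": 300, "data_original_height": 200,
    #    "drag_width": 225, "drag_height": 150}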


    @torch.inference_mode()
    def model_generate(self, fg_image, bg_image, ori_fg_mask, new_fg_mask, enable_mask_affine=True, prompt="", offset_cond=None, seed=None, cfg=3.5, size_select=1024, text_strength=1, truecfg=False):
        batch_size = 1
        print("-3" * 10)
        # Prepare images: adjust the foreground to the background size, then snap to a resolution bucket
        fg_image, ori_fg_mask = adjust_fg_to_bg(fg_image, ori_fg_mask, bg_image.size)
        bucket_size = find_nearest_bucket_size(bg_image.size[0], bg_image.size[1], bucket_size=size_select)

        fg_image = fg_image.resize(bucket_size)
        bg_image = bg_image.resize(bucket_size)

        mask_affine = None
        if enable_mask_affine:
            ori_fg_mask = ori_fg_mask.resize(bucket_size)
            new_fg_mask = new_fg_mask.resize(bucket_size)
            mask_affine = get_mask_affine(new_fg_mask, ori_fg_mask)

        print("-2" * 10)
        # Get text embeddings
        prompt_embeds, pooled_prompt_embeds, text_ids = self._compute_text_embeddings(prompt)

        prompt_embeds = prompt_embeds.repeat(1, text_strength, 1)
        text_ids = text_ids.repeat(text_strength, 1)

        # Prepare guidance (the 'dev' model choice takes an explicit guidance input)
        if self.config.model_choice == "dev":
            guidance = torch.full([1], cfg, device=self.device, dtype=torch.float32)
            guidance = guidance.expand(batch_size)
        else:
            guidance = None

        # Prepare generator
        if seed is None:
            seed = self.config.seed
        generator = torch.Generator(device=self.device).manual_seed(seed)
        print("-1" * 10)
        # Prepare condition latents
        condition_image_latents = self._encode_images([fg_image, bg_image])

        if offset_cond is None:
            offset_cond = self.config.image_ids_offset
        offset_cond = offset_cond[1:]
        cond_latent_image_ids = []
        for offset_ in offset_cond:
            cond_latent_image_ids.append(
                self._prepare_image_ids(
                    condition_image_latents.shape[2] // 2,
                    condition_image_latents.shape[3] // 2,
                    offset_w=offset_ * condition_image_latents.shape[3] // 2
                )
            )

        print(1)
        if mask_affine is not None:
            affine_H, affine_W = condition_image_latents.shape[2] // 2, condition_image_latents.shape[3] // 2
            scale_factor = 1 / 16
            cond_latent_image_ids_fg = cond_latent_image_ids[0].reshape(affine_H, affine_W, 3).clone()

            # opt 1: warp the foreground's positional ids with the mask affine transform
            cond_latent_image_ids[0] = warp_affine_tensor(
                cond_latent_image_ids_fg, mask_affine, output_size=(affine_H, affine_W),
                scale_factor=scale_factor, device=self.device,
            )
        cond_latent_image_ids = torch.stack(cond_latent_image_ids)
        print(2)
        # Pack condition latents
        cond_image_latents = self._pack_latents(condition_image_latents)
        cond_input = {
            "image_latents": cond_image_latents,
            "image_ids": cond_latent_image_ids,
        }
        # Prepare initial latents
        width, height = bucket_size
        num_channels_latents = self.denoise_model.config.in_channels // 4
        latents, latent_image_ids = self._prepare_latents(
            batch_size, num_channels_latents, height, width, generator
        )
        print(3)
        # Setup timesteps
        sigmas = np.linspace(1.0, 1 / self.config.num_inference_steps, self.config.num_inference_steps)
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.base_image_seq_len,
            self.scheduler.config.max_image_seq_len,
            self.scheduler.config.base_shift,
            self.scheduler.config.max_shift,
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            self.config.num_inference_steps,
            self.device,
            sigmas=sigmas,
            mu=mu,
        )
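        # Illustrative note (not in the original file): with num_inference_steps = 4 the base
        # schedule is np.linspace(1.0, 0.25, 4) = [1.0, 0.75, 0.5, 0.25]; calculate_shift then
        # derives mu from the image sequence length, so larger images have their sigmas
        # shifted toward the noisier end of the schedule.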
        print(4)
        # Denoising loop
        for i, t in enumerate(timesteps):
            timestep = t.expand(latents.shape[0]).to(latents.dtype)
            with torch.autocast(enabled=True, device_type="cuda", dtype=self.config.dtype):
                noise_pred = self.denoise_model(
                    hidden_states=latents,
                    cond_input=cond_input,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    data_num_per_group=batch_size,
                    image_tags=self.config.image_tags,
                    context_tags=self.config.context_tags,
                    max_sequence_length=self.config.max_sequence_length,
                    mix_attention_double=self.config.mix_attention_double,
                    mix_attention_single=self.config.mix_attention_single,
                    joint_attention_kwargs=None,
                    return_dict=False,
                )[0]

                if truecfg and i >= 1:
                    # NOTE: guidance_neg is prepared here, but the call below still passes the
                    # positive `guidance` tensor.
                    guidance_neg = torch.full([1], 1, device=self.device, dtype=torch.float32)
                    guidance_neg = guidance_neg.expand(batch_size)
                    noise_pred_neg = self.denoise_model(
                        hidden_states=latents,
                        cond_input=cond_input,
                        timestep=timestep / 1000,
                        guidance=guidance,
                        pooled_projections=pooled_prompt_embeds,
                        encoder_hidden_states=prompt_embeds,
                        txt_ids=text_ids,
                        img_ids=latent_image_ids,
                        data_num_per_group=batch_size,
                        image_tags=self.config.image_tags,
                        context_tags=self.config.context_tags,
                        max_sequence_length=self.config.max_sequence_length,
                        mix_attention_double=self.config.mix_attention_double,
                        mix_attention_single=self.config.mix_attention_single,
                        joint_attention_kwargs=None,
                        return_dict=False,
                    )[0]
                    noise_pred = noise_pred_neg + 5 * (noise_pred - noise_pred_neg)

            # Compute previous noisy sample
            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
        print(5)
        # Decode latents
        latents = self._unpack_latents(latents, height, width)
        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
        images = self.vae.decode(latents, return_dict=False)[0]
        print(6)
        # Post-process images to uint8 HWC numpy arrays
        images = images.add(1).mul(127.5).clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1).cpu().numpy()
        return images
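    # Illustrative note (not in the original file): the truecfg branch combines the two
    # predictions as neg + 5 * (pos - neg), i.e. a classifier-free-guidance-style mix with a
    # hard-coded weight of 5 applied on top of the embedded guidance scale.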

    def _encode_images(self, images):
        return encode_images_cond(self.vae, [images], self.device)

    def _prepare_image_ids(self, h, w, offset_w=0):
        return _prepare_image_ids(h, w, offset_w=offset_w).to(self.device)

    def _pack_latents(self, latents):
        b, c, h, w = latents.shape
        return _pack_latents(latents, b, c, h, w)
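    # Illustrative note (not in the original file): in Flux-style pipelines, packing is
    # assumed to fold each 2x2 latent patch into the channel axis, turning a (B, C, H, W)
    # latent into a (B, H/2 * W/2, C * 4) token sequence; the positional image ids prepared
    # above index those H/2 x W/2 patches.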

    def _unpack_latents(self, latents, height, width):
        vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
        return _unpack_latents(latents, height, width, vae_scale)

    def _prepare_latents(self, batch_size, num_channels_latents, height, width, generator):
        vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
        latents, latent_image_ids = prepare_latents(
            batch_size=batch_size,
            num_channels_latents=num_channels_latents,
            vae_downsample_factor=vae_scale,
            height=height,
            width=width,
            dtype=self.config.dtype,
            device=self.device,
            generator=generator,
            offset=None
        )
        return latents, latent_image_ids


def main():
    parser = transformers.HfArgumentParser(InferenceConfig)
    config: InferenceConfig = parser.parse_args_into_dataclasses()[0]
    model = DreamFuseInference(config)
    os.makedirs(config.valid_output_dir, exist_ok=True)
    for valid_root, valid_json in zip(config.valid_roots, config.valid_jsons):
        with open(valid_json, 'r') as f:
            valid_info = json.load(f)

        # Multi-GPU sharding: each worker processes a deterministic slice of the sorted keys
        to_process = sorted(list(valid_info.keys()))

        # debug
        to_process = [k for k in to_process if "data_wear" in k and "pixelwave" in k]
        # debug

        sd_idx = len(to_process) // config.total_num * config.sub_idx
        ed_idx = len(to_process) // config.total_num * (config.sub_idx + 1)
        if config.sub_idx < config.total_num - 1:
            print(config.sub_idx, sd_idx, ed_idx)
            to_process = to_process[sd_idx:ed_idx]
        else:
            print(config.sub_idx, sd_idx)
            to_process = to_process[sd_idx:]
        valid_info = {k: valid_info[k] for k in to_process}

        for meta_key, info in tqdm(valid_info.items()):
            img_name = meta_key.split('/')[-1]

            foreground_img = Image.open(os.path.join(valid_root, info['img_info']['000']))
            background_img = Image.open(os.path.join(valid_root, info['img_info']['001']))

            new_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000_mask_scale']))
            ori_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000']))

            # debug: dump the raw inputs next to the outputs
            foreground_img.save(os.path.join(config.valid_output_dir, f"{img_name}_0.png"))
            background_img.save(os.path.join(config.valid_output_dir, f"{img_name}_1.png"))
            ori_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask.png"))
            new_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask_scale.png"))
            # debug

            foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_fg_mask))

            images = model(foreground_img.copy(), background_img.copy(),
                           ori_fg_mask, new_fg_mask,
                           prompt=config.ref_prompts,
                           seed=config.seed,
                           cfg=config.guidance_scale,
                           size_select=config.inference_scale,
                           text_strength=config.text_strength,
                           truecfg=config.truecfg)

            result_image = Image.fromarray(images[0], "RGB")
            result_image = result_image.resize(background_img.size)
            result_image.save(os.path.join(config.valid_output_dir, f"{img_name}_2.png"))
            # Make a side-by-side grid of foreground, background and result
            grid_image = [foreground_img, background_img] + [result_image]
            result = make_image_grid(grid_image, 1, len(grid_image), size=result_image.size)

            result.save(os.path.join(config.valid_output_dir, f"{img_name}.jpg"))


if __name__ == "__main__":
    main()
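
# Minimal CLI sketch (not in the original file; flag names follow the InferenceConfig fields
# referenced in main(), everything else is an assumption):
#   python dreamfuse_inference.py \
#       --valid_roots /path/to/data \
#       --valid_jsons /path/to/meta.json \
#       --valid_output_dir ./output_images \
#       --seed 1234
# Each entry in the JSON metadata is expected to provide img_info['000'] / ['001'] for the
# foreground/background and img_mask_info['000'] / ['000_mask_scale'] for the masks.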
examples/9_01.png
ADDED
(binary image stored via Git LFS)

examples/9_02.png
ADDED
(binary image stored via Git LFS)

output_images/no_bg_image.png
ADDED
(binary image stored via Git LFS)
requirements.txt
ADDED
@@ -0,0 +1,37 @@
lmdb==1.4.1
tqdm==4.66.1
datasets
tensorboardX
accelerate
ninja
transformers==4.46.3
pycocotools==2.0.7
scikit-image
Pillow==9.5.0
opencv-python
opencv-python-headless
datasets
einops==0.8.0
sentencepiece
pydantic==2.9.2
deepspeed
peft==0.14.0
diffusers==0.32.0
rotary-embedding-torch==0.8.4
tiktoken==0.8.0
transformers_stream_generator==0.0.5
ftfy
bs4
bson==0.5.10
gradio==5.12.0
httpx
fairscale==0.4.13
kornia
timm==1.0.9
protobuf==3.20.0
basicsr
sentencepiece
huggingface_hub
prodigyopt
torch==2.4.0
torchvision==0.19.0