LL3RD committed
Commit f96f677 · 0 parents
.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
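The attributes above route archives, checkpoints, tensors, and the tracked *.png files through Git LFS, so the repository stores small pointer stubs instead of the binary blobs. As an aside not contained in this commit, a minimal Python sketch for checking whether a local file is still an LFS pointer stub (the example path is hypothetical):

```python
from pathlib import Path

# Git LFS pointer files start with this spec header line.
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"

def is_lfs_pointer(path: str) -> bool:
    """Return True if the file is an LFS pointer stub rather than the real blob."""
    head = Path(path).read_bytes()[: len(LFS_POINTER_PREFIX)]
    return head == LFS_POINTER_PREFIX

# Hypothetical example: a file matched by the patterns above should be a pointer
# stub in a fresh clone until `git lfs pull` fetches the real content.
print(is_lfs_pointer("model.safetensors"))
```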
README.md ADDED
@@ -0,0 +1,13 @@
1
+ ---
2
+ title: DreamFuse
3
+ emoji: 📚
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 5.24.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
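The front matter above is the Hugging Face Spaces configuration: this Space runs `app.py` with the Gradio SDK, version 5.24.0. As an illustration only (assuming PyYAML is installed and README.md is in the working directory), the block can be read back like any YAML document:

```python
import yaml  # assumes PyYAML is available

# Read the YAML front matter between the first pair of '---' fences in README.md.
with open("README.md", encoding="utf-8") as f:
    front_matter = f.read().split("---")[1]

config = yaml.safe_load(front_matter)
print(config["sdk"], config["sdk_version"], config["app_file"])  # gradio 5.24.0 app.py
```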
__pycache__/dreamfuse_inference.cpython-310.pyc ADDED
Binary file (14.1 kB).
 
app.py ADDED
@@ -0,0 +1,491 @@
1
+ import gradio as gr
2
+ import spaces
3
+ from PIL import Image, ImageDraw, ImageOps
4
+ import base64, json
5
+ from io import BytesIO
6
+ import torch.nn.functional as F
7
+ import json
8
+ from typing import List
9
+ from dataclasses import dataclass, field
10
+ from dreamfuse_inference import DreamFuseInference, InferenceConfig
11
+ import numpy as np
12
+ import os
13
+ from transformers import AutoModelForImageSegmentation
14
+ from torchvision import transforms
15
+ import torch
16
+ import subprocess
17
+ subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
18
+ generated_images = []
19
+
20
+
21
+ RMBG_model = AutoModelForImageSegmentation.from_pretrained('briaai/RMBG-2.0', trust_remote_code=True)
22
+ RMBG_model = RMBG_model.to("cuda")
23
+ transform = transforms.Compose([
24
+ transforms.Resize((1024, 1024)),
25
+ transforms.ToTensor(),
26
+ transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
27
+ ])
28
+
29
+ @spaces.GPU
30
+ def remove_bg(image):
31
+ im = image.convert("RGB")
32
+ input_tensor = transform(im).unsqueeze(0).to("cuda")
33
+ with torch.no_grad():
34
+ preds = RMBG_model(input_tensor)[-1].sigmoid().cpu()[0].squeeze()
35
+ mask = transforms.ToPILImage()(preds).resize(im.size)
36
+ return mask
37
+
38
+ class DreamblendGUI:
39
+ def __init__(self):
40
+ self.examples = [
41
+ ["./examples/9_02.png",
42
+ "./examples/9_01.png"],
43
+ ]
44
+ self.examples = [[Image.open(x) for x in example] for example in self.examples]
45
+ self.css_style = self._get_css_style()
46
+ self.js_script = self._get_js_script()
47
+
48
+ def _get_css_style(self):
49
+ return """
50
+ body {
51
+ background: transparent;
52
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
53
+ color: #fff;
54
+ }
55
+ .gradio-container {
56
+ max-width: 1200px;
57
+ margin: auto;
58
+ background: transparent;
59
+ border-radius: 10px;
60
+ padding: 20px;
61
+ box-shadow: 0px 2px 8px rgba(255,255,255,0.1);
62
+ }
63
+ h1, h2 {
64
+ text-align: center;
65
+ color: #fff;
66
+ }
67
+ #canvas_preview {
68
+ border: 2px dashed rgba(255,255,255,0.5);
69
+ padding: 10px;
70
+ background: transparent;
71
+ border-radius: 8px;
72
+ }
73
+ .gr-button {
74
+ background-color: #007bff;
75
+ border: none;
76
+ color: #fff;
77
+ padding: 10px 20px;
78
+ border-radius: 5px;
79
+ font-size: 16px;
80
+ cursor: pointer;
81
+ }
82
+ .gr-button:hover {
83
+ background-color: #0056b3;
84
+ }
85
+ #small-examples {
86
+ max-width: 200px !important;
87
+ width: 200px !important;
88
+ float: left;
89
+ margin-right: 20px;
90
+ }
91
+ """
92
+
93
+ def _get_js_script(self):
94
+ return r"""
95
+ async () => {
96
+ window.updateTransformation = function() {
97
+ const img = document.getElementById('draggable-img');
98
+ const container = document.getElementById('canvas-container');
99
+ if (!img || !container) return;
100
+ const left = parseFloat(img.style.left) || 0;
101
+ const top = parseFloat(img.style.top) || 0;
102
+
103
+ const canvasSize = 400;
104
+ const data_original_width = parseFloat(img.getAttribute('data-original-width'));
105
+ const data_original_height = parseFloat(img.getAttribute('data-original-height'));
106
+ const bgWidth = parseFloat(container.dataset.bgWidth);
107
+ const bgHeight = parseFloat(container.dataset.bgHeight);
108
+ const scale_ratio = img.clientWidth / data_original_width;
109
+
110
+ const transformation = {
111
+ drag_left: left,
112
+ drag_top: top,
113
+ drag_width: img.clientWidth,
114
+ drag_height: img.clientHeight,
115
+ data_original_width: data_original_width,
116
+ data_original_height: data_original_height,
117
+ scale_ratio: scale_ratio
118
+ };
119
+
120
+ const transInput = document.querySelector("#transformation_info textarea");
121
+ if(transInput){
122
+ const newValue = JSON.stringify(transformation);
123
+ const nativeSetter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
124
+ nativeSetter.call(transInput, newValue);
125
+ transInput.dispatchEvent(new Event('input', { bubbles: true }));
126
+ console.log("Transformation info updated: ", newValue);
127
+ } else {
128
+ console.log("Could not find the transformation_info textarea element");
129
+ }
130
+ };
131
+
132
+ globalThis.initializeDrag = () => {
133
+ console.log("Initializing drag and scale handlers...");
134
+ const observer = new MutationObserver(() => {
135
+ const img = document.getElementById('draggable-img');
136
+ const container = document.getElementById('canvas-container');
137
+ const slider = document.getElementById('scale-slider');
138
+ if (img && container && slider) {
139
+ observer.disconnect();
140
+ console.log("Binding drag and scale events...");
141
+ img.ondragstart = (e) => { e.preventDefault(); return false; };
142
+ let offsetX = 0, offsetY = 0;
143
+ let isDragging = false;
144
+ let scaleAnchor = null;
145
+
146
+ img.addEventListener('mousedown', (e) => {
147
+ isDragging = true;
148
+ img.style.cursor = 'grabbing';
149
+ const imgRect = img.getBoundingClientRect();
150
+ offsetX = e.clientX - imgRect.left;
151
+ offsetY = e.clientY - imgRect.top;
152
+ img.style.transform = "none";
153
+ img.style.left = img.offsetLeft + "px";
154
+ img.style.top = img.offsetTop + "px";
155
+ console.log("mousedown: left=", img.style.left, "top=", img.style.top);
156
+ });
157
+ document.addEventListener('mousemove', (e) => {
158
+ if (!isDragging) return;
159
+ e.preventDefault();
160
+
161
+ const containerRect = container.getBoundingClientRect();
162
+ // Compute the dragged position relative to the container
163
+ let left = e.clientX - containerRect.left - offsetX;
164
+ let top = e.clientY - containerRect.top - offsetY;
165
+
166
+ // Allowed drag range:
167
+ // Horizontal minimum: -img.clientWidth * (7/8) (the image may extend past the left edge)
168
+ // Horizontal maximum: containerRect.width - img.clientWidth * (1/8)
169
+ const minLeft = -img.clientWidth * (7/8);
170
+ const maxLeft = containerRect.width - img.clientWidth * (1/8);
171
+
172
+ // Vertical range:
173
+ // minimum: -img.clientHeight * (7/8)
174
+ // maximum: containerRect.height - img.clientHeight * (1/8)
175
+ const minTop = -img.clientHeight * (7/8);
176
+ const maxTop = containerRect.height - img.clientHeight * (1/8);
177
+
178
+ // Clamp to the allowed range
179
+ if (left < minLeft) left = minLeft;
180
+ if (left > maxLeft) left = maxLeft;
181
+
182
+ if (top < minTop) top = minTop;
183
+ if (top > maxTop) top = maxTop;
184
+
185
+ img.style.left = left + "px";
186
+ img.style.top = top + "px";
187
+ });
188
+
189
+ window.addEventListener('mouseup', (e) => {
190
+ if (isDragging) {
191
+ isDragging = false;
192
+ img.style.cursor = 'grab';
193
+ const containerRect = container.getBoundingClientRect();
194
+ const bgWidth = parseFloat(container.dataset.bgWidth);
195
+ const bgHeight = parseFloat(container.dataset.bgHeight);
196
+ const offsetLeft = (containerRect.width - bgWidth) / 2;
197
+ const offsetTop = (containerRect.height - bgHeight) / 2;
198
+ const absoluteLeft = parseFloat(img.style.left);
199
+ const absoluteTop = parseFloat(img.style.top);
200
+ const relativeX = absoluteLeft - offsetLeft;
201
+ const relativeY = absoluteTop - offsetTop;
202
+ document.getElementById("coordinate").textContent =
203
+ `Foreground position: (x=${relativeX.toFixed(2)}, y=${relativeY.toFixed(2)})`;
204
+ updateTransformation();
205
+ }
206
+ scaleAnchor = null;
207
+ });
208
+
209
+ slider.addEventListener('mousedown', (e) => {
210
+ const containerRect = container.getBoundingClientRect();
211
+ const imgRect = img.getBoundingClientRect();
212
+ scaleAnchor = {
213
+ x: imgRect.left + imgRect.width/2 - containerRect.left,
214
+ y: imgRect.top + imgRect.height/2 - containerRect.top
215
+ };
216
+ console.log("Slider mousedown, captured scaleAnchor: ", scaleAnchor);
217
+ });
218
+
219
+ slider.addEventListener('input', (e) => {
220
+ const scale = parseFloat(e.target.value);
221
+ const originalWidth = parseFloat(img.getAttribute('data-original-width'));
222
+ const originalHeight = parseFloat(img.getAttribute('data-original-height'));
223
+ const newWidth = originalWidth * scale;
224
+ const newHeight = originalHeight * scale;
225
+ const containerRect = container.getBoundingClientRect();
226
+ let centerX, centerY;
227
+ if (scaleAnchor) {
228
+ centerX = scaleAnchor.x;
229
+ centerY = scaleAnchor.y;
230
+ } else {
231
+ const imgRect = img.getBoundingClientRect();
232
+ centerX = imgRect.left + imgRect.width/2 - containerRect.left;
233
+ centerY = imgRect.top + imgRect.height/2 - containerRect.top;
234
+ }
235
+ const newLeft = centerX - newWidth/2;
236
+ const newTop = centerY - newHeight/2;
237
+ img.style.width = newWidth + "px";
238
+ img.style.height = newHeight + "px";
239
+ img.style.left = newLeft + "px";
240
+ img.style.top = newTop + "px";
241
+ console.log("slider: scale=", scale, "newWidth=", newWidth, "newHeight=", newHeight);
242
+ updateTransformation();
243
+ });
244
+
245
+ slider.addEventListener('mouseup', (e) => {
246
+ scaleAnchor = null;
247
+ });
248
+ }
249
+ });
250
+ observer.observe(document.body, { childList: true, subtree: true });
251
+ };
252
+ }
253
+ """
254
+
255
+
256
+ def get_next_sequence(self, folder_path):
257
+ # List all filenames in the folder
258
+ filenames = os.listdir(folder_path)
259
+ # Extract the numeric sequence prefix from each filename (assumed to be the leading digits)
260
+ sequences = [int(name.split('_')[0]) for name in filenames if name.split('_')[0].isdigit()]
261
+ # Find the largest sequence number
262
+ max_sequence = max(sequences, default=-1)
263
+ # Return the next sequence number, zero-padded to three digits (e.g. 002)
264
+ return f"{max_sequence + 1:03d}"
265
+
266
+
267
+ def pil_to_base64(self, img):
268
+ """Convert a PIL Image to a base64 string; PNG keeps the alpha channel."""
269
+ if img is None:
270
+ return ""
271
+ if img.mode != "RGBA":
272
+ img = img.convert("RGBA")
273
+ buffered = BytesIO()
274
+ img.save(buffered, format="PNG", optimize=True)
275
+ img_bytes = buffered.getvalue()
276
+ base64_str = base64.b64encode(img_bytes).decode()
277
+ return f"data:image/png;base64,{base64_str}"
278
+
279
+ def resize_background_image(self, img, max_size=400):
280
+ """Proportionally resize the background image so its longest side equals max_size (400)."""
281
+ if img is None:
282
+ return None
283
+ w, h = img.size
284
+ if w > max_size or h > max_size:
285
+ ratio = min(max_size / w, max_size / h)
286
+ new_w, new_h = int(w * ratio), int(h * ratio)
287
+ img = img.resize((new_w, new_h), Image.LANCZOS)
288
+ return img
289
+
290
+ def resize_draggable_image(self, img, max_size=400):
291
+ """Proportionally resize the foreground image so its longest side does not exceed max_size (400)."""
292
+ if img is None:
293
+ return None
294
+ w, h = img.size
295
+ if w > max_size or h > max_size:
296
+ ratio = min(max_size / w, max_size / h)
297
+ new_w, new_h = int(w * ratio), int(h * ratio)
298
+ img = img.resize((new_w, new_h), Image.LANCZOS)
299
+ return img
300
+
301
+ def generate_html(self, background_img_b64, bg_width, bg_height, draggable_img_b64, draggable_width, draggable_height, canvas_size=400):
302
+ """Generate the preview HTML page."""
303
+ html_code = f"""
304
+ <html>
305
+ <head>
306
+ <style>
307
+ body {{
308
+ margin: 0;
309
+ padding: 0;
310
+ text-align: center;
311
+ font-family: sans-serif;
312
+ background: transparent;
313
+ color: #fff;
314
+ }}
315
+ h2 {{
316
+ margin-top: 1rem;
317
+ }}
318
+ #scale-control {{
319
+ margin: 1rem auto;
320
+ width: 400px;
321
+ text-align: left;
322
+ }}
323
+ #scale-control label {{
324
+ font-size: 1rem;
325
+ margin-right: 0.5rem;
326
+ }}
327
+ #canvas-container {{
328
+ position: relative;
329
+ width: {canvas_size}px;
330
+ height: {canvas_size}px;
331
+ margin: 0 auto;
332
+ border: 1px dashed rgba(255,255,255,0.5);
333
+ overflow: hidden;
334
+ background-image: url('{background_img_b64}');
335
+ background-repeat: no-repeat;
336
+ background-position: center;
337
+ background-size: contain;
338
+ border-radius: 8px;
339
+ }}
340
+ #draggable-img {{
341
+ position: absolute;
342
+ cursor: grab;
343
+ left: 50%;
344
+ top: 50%;
345
+ transform: translate(-50%, -50%);
346
+ background-color: transparent;
347
+ }}
348
+ #coordinate {{
349
+ color: #fff;
350
+ margin-top: 1rem;
351
+ font-weight: bold;
352
+ }}
353
+ </style>
354
+ </head>
355
+ <body>
356
+ <h2>Drag the foreground image (scaling supported)</h2>
357
+ <div id="scale-control">
358
+ <label for="scale-slider">Foreground scale:</label>
359
+ <input type="range" id="scale-slider" min="0.1" max="2" step="0.01" value="1">
360
+ </div>
361
+ <div id="canvas-container" data-bg-width="{bg_width}" data-bg-height="{bg_height}">
362
+ <img id="draggable-img"
363
+ src="{draggable_img_b64}"
364
+ alt="Draggable Image"
365
+ draggable="false"
366
+ data-original-width="{draggable_width}"
367
+ data-original-height="{draggable_height}"
368
+ />
369
+ </div>
370
+ <p id="coordinate">Foreground position: (x=?, y=?)</p>
371
+ </body>
372
+ </html>
373
+ """
374
+ return html_code
375
+
376
+ def on_upload(self, background_img, draggable_img):
377
+ """Process the uploaded images."""
378
+ if background_img is None or draggable_img is None:
379
+ return "<p style='color:red;'>Please upload a background image and a draggable foreground image first.</p>"
380
+
381
+ if draggable_img.mode != "RGB":
382
+ draggable_img = draggable_img.convert("RGB")
383
+ draggable_img_mask = remove_bg(draggable_img)
384
+ alpha_channel = draggable_img_mask.convert("L")
385
+ draggable_img = draggable_img.convert("RGBA")
386
+ draggable_img.putalpha(alpha_channel)
387
+
388
+ resized_bg = self.resize_background_image(background_img, max_size=400)
389
+ bg_w, bg_h = resized_bg.size
390
+
391
+ resized_fg = self.resize_draggable_image(draggable_img, max_size=400)
392
+ draggable_width, draggable_height = resized_fg.size
393
+
394
+ background_img_b64 = self.pil_to_base64(resized_bg)
395
+ draggable_img_b64 = self.pil_to_base64(resized_fg)
396
+
397
+ return self.generate_html(
398
+ background_img_b64, bg_w, bg_h,
399
+ draggable_img_b64, draggable_width, draggable_height,
400
+ canvas_size=400
401
+ ), draggable_img
402
+
403
+ def save_image(self, save_path = "/mnt/bn/hjj-humanseg-lq/SubjectDriven/DreamFuse/debug"):
404
+ global generated_images
405
+ save_name = self.get_next_sequence(save_path)
406
+ generated_images[0].save(os.path.join(save_path, f"{save_name}_0_ori.png"))
407
+ generated_images[1].save(os.path.join(save_path, f"{save_name}_0.png"))
408
+ generated_images[2].save(os.path.join(save_path, f"{save_name}_1.png"))
409
+ generated_images[3].save(os.path.join(save_path, f"{save_name}_2.png"))
410
+ generated_images[4].save(os.path.join(save_path, f"{save_name}_0_mask.png"))
411
+ generated_images[5].save(os.path.join(save_path, f"{save_name}_0_mask_scale.png"))
412
+ generated_images[6].save(os.path.join(save_path, f"{save_name}_0_scale.png"))
413
+ generated_images[7].save(os.path.join(save_path, f"{save_name}_2_pasted.png"))
414
+
415
+
416
+ def create_gui(self):
417
+ config = InferenceConfig()
418
+ config.lora_id = 'LL3RD/DreamFuse'
419
+
420
+ pipeline = DreamFuseInference(config)
421
+ pipeline.gradio_generate = spaces.GPU(duration=120)(pipeline.gradio_generate)
422
+ """Build the Gradio interface."""
423
+ with gr.Blocks(css=self.css_style) as demo:
424
+ modified_fg_state = gr.State()
425
+ gr.Markdown("# Dreamblend-GUI-dirtydata")
426
+ gr.Markdown("Upload a background and a foreground image to compose them with a draggable/scalable preview; Seed and Prompt text input are also supported.")
427
+ with gr.Row():
428
+ with gr.Column(scale=1):
429
+ gr.Markdown("### Upload Images")
430
+ background_img_in = gr.Image(label="Background image", type="pil", height=240, width=240)
431
+ draggable_img_in = gr.Image(label="Foreground image", type="pil", image_mode="RGBA", height=240, width=240)
432
+ generate_btn = gr.Button("Generate draggable canvas")
433
+
434
+ with gr.Row():
435
+ gr.Examples(
436
+ examples=[self.examples[0]],
437
+ inputs=[background_img_in, draggable_img_in],
438
+ elem_id="small-examples"
439
+ )
440
+ with gr.Column(scale=1):
441
+ gr.Markdown("### Preview")
442
+ html_out = gr.HTML(label="Preview & drag", elem_id="canvas_preview")
443
+
444
+ with gr.Row():
445
+ with gr.Column(scale=1):
446
+ gr.Markdown("### Settings")
447
+ seed_slider = gr.Slider(minimum=0, maximum=10000, step=1, label="Seed", value=42)
448
+ cfg_slider = gr.Slider(minimum=1, maximum=10, step=0.1, label="CFG", value=3.5)
449
+ size_select = gr.Radio(
450
+ choices=["512", "768", "1024"],
451
+ value="512",
452
+ label="Output quality (512 = low, 1024 = high)",
453
+ )
454
+ prompt_text = gr.Textbox(label="Prompt", placeholder="Enter a text prompt", value="")
455
+ text_strength = gr.Slider(minimum=1, maximum=10, step=1, label="Text Strength", value=1)
456
+ enable_gui = gr.Checkbox(label="Enable GUI", value=True)
457
+ enable_truecfg = gr.Checkbox(label="Enable TrueCFG", value=False)
458
+ enable_save = gr.Button("Save images (internal testing)", visible=True)
459
+ with gr.Column(scale=1):
460
+ gr.Markdown("### Model Output")
461
+ model_generate_btn = gr.Button("Generate with model")
462
+ transformation_text = gr.Textbox(label="Transformation Info", elem_id="transformation_info", visible=False)
463
+ model_output = gr.Image(label="Model output", type="pil")
464
+
465
+ # Bind interaction events
466
+ enable_save.click(fn=self.save_image, inputs=None, outputs=None)
467
+ generate_btn.click(
468
+ fn=self.on_upload,
469
+ inputs=[background_img_in, draggable_img_in],
470
+ outputs=[html_out, modified_fg_state],
471
+ )
472
+ model_generate_btn.click(
473
+ fn=pipeline.gradio_generate,
474
+ inputs=[background_img_in, modified_fg_state, transformation_text, seed_slider, \
475
+ prompt_text, enable_gui, cfg_slider, size_select, text_strength, enable_truecfg],
476
+ outputs=model_output
477
+ )
478
+ # Initialize drag/scale handlers after the page loads
479
+ demo.load(None, None, None, js=self.js_script)
480
+ generate_btn.click(fn=None, inputs=None, outputs=None, js="initializeDrag")
481
+
482
+ return demo
483
+
484
+ if __name__ == "__main__":
485
+
486
+ gui = DreamblendGUI()
487
+ demo = gui.create_gui()
488
+ demo.queue()
489
+ demo.launch()
490
+ # demo.launch(server_port=7789, ssr_mode=False)
491
+ # demo.launch(server_name="[::]", share=True)
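The in-page JavaScript reports the drag state back to Python through the hidden `transformation_info` textbox as JSON with `drag_left`, `drag_top`, `drag_width`, `drag_height`, `data_original_width`, `data_original_height`, and `scale_ratio`. This commit does not show how `dreamfuse_inference.py` consumes those values; the sketch below is only an illustration of one plausible mapping from the 400 px preview canvas back to background-image pixels, assuming the background is centered and fit with `background-size: contain` as in `generate_html()`:

```python
import json

CANVAS_SIZE = 400  # size of the preview canvas in generate_html()

def canvas_to_background(transformation_json: str, bg_width: int, bg_height: int):
    """Illustrative mapping of the dragged foreground box into background pixels.

    Hypothetical helper, not part of this commit: the real logic lives in
    dreamfuse_inference.py. Assumes the background is centered in the canvas
    and scaled with 'contain' semantics.
    """
    t = json.loads(transformation_json)
    fit = min(CANVAS_SIZE / bg_width, CANVAS_SIZE / bg_height)  # contain scale factor
    offset_x = (CANVAS_SIZE - bg_width * fit) / 2   # horizontal letterboxing
    offset_y = (CANVAS_SIZE - bg_height * fit) / 2  # vertical letterboxing
    left = (t["drag_left"] - offset_x) / fit
    top = (t["drag_top"] - offset_y) / fit
    return left, top, t["drag_width"] / fit, t["drag_height"] / fit
```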
dreamfuse/.DS_Store ADDED
Binary file (6.15 kB).
 
dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc ADDED
Binary file (7.61 kB).
 
dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (23.9 kB).
 
dreamfuse/models/dreamfuse_flux/flux_processor.py ADDED
@@ -0,0 +1,269 @@
1
+ import inspect
2
+ import math
3
+ from typing import Callable, List, Optional, Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ from diffusers.image_processor import IPAdapterMaskProcessor
10
+ from diffusers.utils import deprecate, logging
11
+ from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
12
+ from diffusers.utils.torch_utils import is_torch_version, maybe_allow_in_graph
13
+ from diffusers.models.attention import Attention
14
+ from diffusers.models.embeddings import Timesteps, TimestepEmbedding, PixArtAlphaTextProjection
15
+
16
+ class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
17
+ def __init__(self, embedding_dim, pooled_projection_dim):
18
+ super().__init__()
19
+
20
+ self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
21
+ self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
22
+ self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
23
+ self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
24
+
25
+ def forward(self, timestep, guidance, pooled_projection):
26
+ timesteps_proj = self.time_proj(timestep)
27
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype)) # (N, D)
28
+
29
+ if (guidance >= 0).all():
30
+ guidance_proj = self.time_proj(guidance)
31
+ guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype)) # (N, D)
32
+
33
+ time_guidance_emb = timesteps_emb + guidance_emb
34
+
35
+ pooled_projections = self.text_embedder(pooled_projection)
36
+ conditioning = time_guidance_emb + pooled_projections
37
+ else:
38
+ pooled_projections = self.text_embedder(pooled_projection)
39
+ conditioning = timesteps_emb + pooled_projections
40
+
41
+ return conditioning
42
+
43
+
44
+ def apply_rotary_emb(
45
+ x: torch.Tensor,
46
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
47
+ use_real: bool = True,
48
+ use_real_unbind_dim: int = -1,
49
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
50
+ """
51
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
52
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
53
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
54
+ tensors contain rotary embeddings and are returned as real tensors.
55
+
56
+ Args:
57
+ x (`torch.Tensor`):
58
+ Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
59
+ freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
60
+
61
+ Returns:
62
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
63
+ """
64
+ if use_real:
65
+ cos, sin = freqs_cis # [S, D]
66
+ if cos.ndim == 2:
67
+ cos = cos[None, None]
68
+ else:
69
+ cos = cos.unsqueeze(1)
70
+ if sin.ndim == 2:
71
+ sin = sin[None, None]
72
+ else:
73
+ sin = sin.unsqueeze(1)
74
+ cos, sin = cos.to(x.device), sin.to(x.device)
75
+
76
+ if use_real_unbind_dim == -1:
77
+ # Used for flux, cogvideox, hunyuan-dit
78
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
79
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
80
+ elif use_real_unbind_dim == -2:
81
+ # Used for Stable Audio
82
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
83
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
84
+ else:
85
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
86
+
87
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
88
+
89
+ return out
90
+ else:
91
+ # used for lumina
92
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
93
+ freqs_cis = freqs_cis.unsqueeze(2)
94
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
95
+
96
+ return x_out.type_as(x)
97
+
98
+ class FluxAttnSharedProcessor2_0:
99
+ """Attention processor for SD3-like self-attention projections, extended to share attention with optional condition latents."""
100
+
101
+ def __init__(self):
102
+ if not hasattr(F, "scaled_dot_product_attention"):
103
+ raise ImportError("FluxAttnSharedProcessor2_0 requires PyTorch 2.0; please upgrade PyTorch to use it.")
104
+
105
+ def __call__(
106
+ self,
107
+ attn: Attention,
108
+ hidden_states: torch.FloatTensor,
109
+ encoder_hidden_states: torch.FloatTensor = None,
110
+ attention_mask: Optional[torch.FloatTensor] = None,
111
+ image_rotary_emb: Optional[torch.Tensor] = None,
112
+ data_num_per_group: Optional[int] = 1,
113
+ max_sequence_length: Optional[int] = 512,
114
+ mix_attention: bool = True,
115
+ cond_latents = None,
116
+ cond_image_rotary_emb = None,
117
+ work_mode = None,
118
+ mask_cond = None,
119
+ ) -> torch.FloatTensor:
120
+ with_cond = cond_latents is not None and mix_attention
121
+
122
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
123
+
124
+ # `sample` projections.
125
+ query = attn.to_q(hidden_states)
126
+ key = attn.to_k(hidden_states)
127
+ value = attn.to_v(hidden_states)
128
+
129
+ inner_dim = key.shape[-1]
130
+ head_dim = inner_dim // attn.heads
131
+
132
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
133
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
134
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
135
+
136
+ if attn.norm_q is not None:
137
+ query = attn.norm_q(query)
138
+ if attn.norm_k is not None:
139
+ key = attn.norm_k(key)
140
+
141
+
142
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
143
+ if encoder_hidden_states is not None:
144
+ # `context` projections.
145
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
146
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
147
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
148
+
149
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
150
+ batch_size, -1, attn.heads, head_dim
151
+ ).transpose(1, 2)
152
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
153
+ batch_size, -1, attn.heads, head_dim
154
+ ).transpose(1, 2)
155
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
156
+ batch_size, -1, attn.heads, head_dim
157
+ ).transpose(1, 2)
158
+
159
+ if attn.norm_added_q is not None:
160
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
161
+ if attn.norm_added_k is not None:
162
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
163
+
164
+ # attention
165
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
166
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
167
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
168
+
169
+ if image_rotary_emb is not None:
170
+ query = apply_rotary_emb(query, image_rotary_emb)
171
+ key = apply_rotary_emb(key, image_rotary_emb)
172
+
173
+ if with_cond:
174
+ cond_bs = cond_latents.shape[0]
175
+
176
+ # update condition
177
+ cond_query = attn.to_q(cond_latents)
178
+ cond_query = cond_query.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
179
+ if attn.norm_q is not None:
180
+ cond_query = attn.norm_q(cond_query)
181
+ cond_query = apply_rotary_emb(cond_query, cond_image_rotary_emb)
182
+ cond_query = torch.cat(cond_query.chunk(len(cond_query), dim=0), dim=2)
183
+
184
+ cond_key = attn.to_k(cond_latents)
185
+ cond_value = attn.to_v(cond_latents)
186
+ cond_key = cond_key.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
187
+ cond_value = cond_value.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
188
+ if attn.norm_k is not None:
189
+ cond_key = attn.norm_k(cond_key)
190
+
191
+ cond_key = apply_rotary_emb(cond_key, cond_image_rotary_emb)
192
+
193
+ cond_key = torch.cat(cond_key.chunk(len(cond_key), dim=0), dim=2)
194
+ cond_value = torch.cat(cond_value.chunk(len(cond_value), dim=0), dim=2)
195
+
196
+ if data_num_per_group > 1 and mix_attention:
197
+ E = max_sequence_length # according to text len
198
+
199
+ key_enc, key_hid = key[:, :, :E], key[:, :, E:]
200
+ value_enc, value_hid = value[:, :, :E], value[:, :, E:]
201
+
202
+ key_layer = key_hid.chunk(data_num_per_group, dim=0)
203
+ key_layer = torch.cat(key_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)
204
+
205
+ value_layer = value_hid.chunk(data_num_per_group, dim=0)
206
+ value_layer = torch.cat(value_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)
207
+
208
+ key = torch.cat([key_enc, key_layer], dim=2)
209
+ value = torch.cat([value_enc, value_layer], dim=2)
210
+
211
+ elif data_num_per_group == 1 and mix_attention and with_cond:
212
+ E = max_sequence_length # according to text len
213
+
214
+ key_enc, key_hid = key[:, :, :E], key[:, :, E:]
215
+ value_enc, value_hid = value[:, :, :E], value[:, :, E:]
216
+
217
+ # todo: support bs != 1
218
+ key_layer = torch.cat([key_hid, cond_key], dim=2)
219
+ value_layer = torch.cat([value_hid, cond_value], dim=2)
220
+
221
+ key = torch.cat([key_enc, key_layer], dim=2)
222
+ value = torch.cat([value_enc, value_layer], dim=2)
223
+
224
+ # concat query
225
+ query_enc, query_hid = query[:, :, :E], query[:, :, E:]
226
+ query_layer = torch.cat([query_hid, cond_query], dim=2)
227
+ query = torch.cat([query_enc, query_layer], dim=2)
228
+
229
+ hidden_states = F.scaled_dot_product_attention(
230
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,
231
+ )
232
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
233
+ hidden_states = hidden_states.to(query.dtype)
234
+
235
+ if encoder_hidden_states is not None:
236
+ if with_cond:
237
+ encoder_hidden_states, hidden_states, cond_latents = (
238
+ hidden_states[:, : encoder_hidden_states.shape[1]],
239
+ hidden_states[:, encoder_hidden_states.shape[1] : -cond_latents.shape[1]*cond_bs],
240
+ hidden_states[:, -cond_latents.shape[1]*cond_bs :],
241
+ )
242
+ cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
243
+ cond_latents = attn.to_out[0](cond_latents)
244
+ cond_latents = attn.to_out[1](cond_latents)
245
+ else:
246
+ encoder_hidden_states, hidden_states = (
247
+ hidden_states[:, : encoder_hidden_states.shape[1]],
248
+ hidden_states[:, encoder_hidden_states.shape[1]:],
249
+ )
250
+
251
+ # linear proj
252
+ hidden_states = attn.to_out[0](hidden_states)
253
+ # dropout
254
+ hidden_states = attn.to_out[1](hidden_states)
255
+
256
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
257
+
258
+ if with_cond:
259
+ return hidden_states, encoder_hidden_states, cond_latents
260
+ return hidden_states, encoder_hidden_states
261
+ else:
262
+ if with_cond:
263
+ hidden_states, cond_latents = (
264
+ hidden_states[:, : -cond_latents.shape[1]*cond_bs],
265
+ hidden_states[:, -cond_latents.shape[1]*cond_bs :],
266
+ )
267
+ cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
268
+ return hidden_states, cond_latents
269
+ return hidden_states
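`apply_rotary_emb` above rotates query/key features with precomputed cos/sin tables, and `FluxAttnSharedProcessor2_0` reuses it for both the main latents and the optional condition latents. A small self-contained check of the shape convention ([B, H, S, D] inputs against [S, D] cos/sin tables) can be run directly against this module; the tensor sizes are arbitrary and chosen only for illustration:

```python
import torch
from dreamfuse.models.dreamfuse_flux.flux_processor import apply_rotary_emb

# Arbitrary illustrative sizes: batch 1, 2 heads, 4 tokens, head_dim 8 (must be even).
B, H, S, D = 1, 2, 4, 8
query = torch.randn(B, H, S, D)

# Precomputed rotary tables, interleaved per pair of channels: shape [S, D].
pos = torch.arange(S, dtype=torch.float32)[:, None]                        # [S, 1]
freqs = 1.0 / (10000 ** (torch.arange(0, D, 2, dtype=torch.float32) / D))  # [D/2]
angles = pos * freqs                                                       # [S, D/2]
cos = torch.cos(angles).repeat_interleave(2, dim=-1)                       # [S, D]
sin = torch.sin(angles).repeat_interleave(2, dim=-1)                       # [S, D]

rotated = apply_rotary_emb(query, (cos, sin), use_real=True)
assert rotated.shape == query.shape  # the rotation preserves [B, H, S, D]
print(rotated.shape)
```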
dreamfuse/models/dreamfuse_flux/transformer.py ADDED
@@ -0,0 +1,866 @@
1
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.models.attention import FeedForward
26
+ from diffusers.models.attention_processor import (
27
+ Attention,
28
+ AttentionProcessor,
29
+ FluxAttnProcessor2_0,
30
+ FluxAttnProcessor2_0_NPU,
31
+ FusedFluxAttnProcessor2_0,
32
+ )
33
+ from dreamfuse.models.dreamfuse_flux.flux_processor import FluxAttnSharedProcessor2_0
34
+ from diffusers.models.modeling_utils import ModelMixin
35
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
36
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
37
+ from diffusers.utils.import_utils import is_torch_npu_available
38
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
39
+ from diffusers.models.embeddings import CombinedTimestepTextProjEmbeddings, FluxPosEmbed
40
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
41
+
42
+ from .flux_processor import CombinedTimestepGuidanceTextProjEmbeddings
43
+
44
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
45
+
46
+ def zero_module(module):
47
+ for p in module.parameters():
48
+ nn.init.zeros_(p)
49
+ return module
50
+
51
+ class LayerNorm2d(nn.Module):
52
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
53
+ super().__init__()
54
+ self.weight = nn.Parameter(torch.ones(num_channels))
55
+ self.bias = nn.Parameter(torch.zeros(num_channels))
56
+ self.eps = eps
57
+
58
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
59
+ u = x.mean(1, keepdim=True)
60
+ s = (x - u).pow(2).mean(1, keepdim=True)
61
+ x = (x - u) / torch.sqrt(s + self.eps)
62
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
63
+ return x
64
+
65
+
66
+ class CrossAttention(nn.Module):
67
+ def __init__(self, query_dim: int, cross_attention_dim: int, heads: int = 8, dim_head: int = 64, dropout: float = 0.0, bias: bool = False):
68
+ super().__init__()
69
+ self.heads = heads
70
+ self.dim_head = cross_attention_dim // heads
71
+ self.attn_to_q = nn.Linear(query_dim, cross_attention_dim, bias=bias)
72
+ self.norm_q = nn.LayerNorm(self.dim_head)
73
+
74
+ self.attn_to_k = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)
75
+ self.norm_k = nn.LayerNorm(self.dim_head)
76
+
77
+ self.attn_to_v = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)
78
+
79
+ self.attn_to_out = nn.ModuleList([])
80
+ self.attn_to_out.append(nn.Linear(query_dim, query_dim, bias=bias))
81
+ self.attn_to_out.append(nn.Dropout(dropout))
82
+
83
+ # zero init
84
+ with torch.no_grad():
85
+ self.attn_to_out[0].weight.fill_(0)
86
+ # self.to_out[0].bias.fill_(0)
87
+
88
+ def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
89
+ batch_size, sequence_length, _ = hidden_states.shape
90
+
91
+ query = self.attn_to_q(hidden_states)
92
+ key = self.attn_to_k(encoder_hidden_states)
93
+ value = self.attn_to_v(encoder_hidden_states)
94
+
95
+ inner_dim = key.shape[-1]
96
+ head_dim = inner_dim // self.heads
97
+
98
+ query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
99
+ key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
100
+ value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
101
+
102
+ query = self.norm_q(query)
103
+ key = self.norm_k(key)
104
+
105
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,)
106
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
107
+
108
+ hidden_states = self.attn_to_out[0](hidden_states)
109
+ hidden_states = self.attn_to_out[1](hidden_states)
110
+
111
+ return hidden_states
112
+
113
+ @maybe_allow_in_graph
114
+ class FluxSingleTransformerBlock(nn.Module):
115
+ r"""
116
+ A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
117
+
118
+ Reference: https://arxiv.org/abs/2403.03206
119
+
120
+ Parameters:
121
+ dim (`int`): The number of channels in the input and output.
122
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
123
+ attention_head_dim (`int`): The number of channels in each head.
124
+ context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
125
+ processing of `context` conditions.
126
+ """
127
+
128
+ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
129
+ super().__init__()
130
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
131
+
132
+ self.norm = AdaLayerNormZeroSingle(dim)
133
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
134
+ self.act_mlp = nn.GELU(approximate="tanh")
135
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
136
+
137
+ processor = FluxAttnSharedProcessor2_0()
138
+
139
+ self.attn = Attention(
140
+ query_dim=dim,
141
+ cross_attention_dim=None,
142
+ dim_head=attention_head_dim,
143
+ heads=num_attention_heads,
144
+ out_dim=dim,
145
+ bias=True,
146
+ processor=processor,
147
+ qk_norm="rms_norm",
148
+ eps=1e-6,
149
+ pre_only=True,
150
+ )
151
+
152
+ def forward(
153
+ self,
154
+ hidden_states: torch.FloatTensor,
155
+ temb: torch.FloatTensor,
156
+ image_rotary_emb=None,
157
+ data_num_per_group=1,
158
+ max_sequence_length=512,
159
+ mix_attention: bool = True,
160
+ cond_temb = None,
161
+ cond_image_rotary_emb = None,
162
+ cond_latents = None,
163
+ joint_attention_kwargs=None,
164
+
165
+ ):
166
+ with_cond = cond_latents is not None and mix_attention
167
+
168
+ residual = hidden_states
169
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
170
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
171
+
172
+ if with_cond:
173
+ residual_cond = cond_latents
174
+ norm_cond_latents, cond_gate = self.norm(cond_latents, emb=cond_temb)
175
+ mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_latents))
176
+
177
+ joint_attention_kwargs = joint_attention_kwargs or {}
178
+ attn_output = self.attn(
179
+ hidden_states=norm_hidden_states,
180
+ image_rotary_emb=image_rotary_emb,
181
+ data_num_per_group=data_num_per_group,
182
+ max_sequence_length=max_sequence_length,
183
+ mix_attention=mix_attention,
184
+ cond_latents=norm_cond_latents if with_cond else None,
185
+ cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
186
+ **joint_attention_kwargs,
187
+ )
188
+
189
+ if with_cond:
190
+ attn_output, cond_attn_output = attn_output
191
+
192
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
193
+ gate = gate.unsqueeze(1)
194
+ hidden_states = gate * self.proj_out(hidden_states)
195
+ hidden_states = residual + hidden_states
196
+
197
+ if with_cond:
198
+ cond_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
199
+ cond_gate = cond_gate.unsqueeze(1)
200
+ cond_latents = cond_gate * self.proj_out(cond_latents)
201
+ cond_latents = residual_cond + cond_latents
202
+
203
+ if hidden_states.dtype == torch.float16:
204
+ hidden_states = hidden_states.clip(-65504, 65504)
205
+
206
+ if with_cond:
207
+ return hidden_states, cond_latents
208
+ else:
209
+ return hidden_states
210
+
211
+
212
+ @maybe_allow_in_graph
213
+ class FluxTransformerBlock(nn.Module):
214
+ r"""
215
+ A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
216
+
217
+ Reference: https://arxiv.org/abs/2403.03206
218
+
219
+ Parameters:
220
+ dim (`int`): The number of channels in the input and output.
221
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
222
+ attention_head_dim (`int`): The number of channels in each head.
223
+ context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
224
+ processing of `context` conditions.
225
+ """
226
+
227
+ def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
228
+ super().__init__()
229
+
230
+ self.norm1 = AdaLayerNormZero(dim)
231
+
232
+ self.norm1_context = AdaLayerNormZero(dim)
233
+
234
+ processor = FluxAttnSharedProcessor2_0()
235
+
236
+ self.attn = Attention(
237
+ query_dim=dim,
238
+ cross_attention_dim=None,
239
+ added_kv_proj_dim=dim,
240
+ dim_head=attention_head_dim,
241
+ heads=num_attention_heads,
242
+ out_dim=dim,
243
+ context_pre_only=False,
244
+ bias=True,
245
+ processor=processor,
246
+ qk_norm=qk_norm,
247
+ eps=eps,
248
+ )
249
+
250
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
251
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
252
+
253
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
254
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
255
+
256
+ # let chunk size default to None
257
+ self._chunk_size = None
258
+ self._chunk_dim = 0
259
+
260
+ def forward(
261
+ self,
262
+ hidden_states: torch.FloatTensor,
263
+ encoder_hidden_states: torch.FloatTensor,
264
+ temb: torch.FloatTensor,
265
+ image_rotary_emb=None,
266
+ data_num_per_group=1,
267
+ max_sequence_length=512,
268
+ mix_attention: bool = True,
269
+ cond_temb = None,
270
+ cond_image_rotary_emb = None,
271
+ cond_latents = None,
272
+ joint_attention_kwargs=None,
273
+ ):
274
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
275
+
276
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
277
+ encoder_hidden_states, emb=temb
278
+ )
279
+ joint_attention_kwargs = joint_attention_kwargs or {}
280
+
281
+ with_cond = cond_latents is not None and mix_attention
282
+ if with_cond:
283
+ norm_cond_latents, cond_gate_msa, cond_shift_mlp, cond_scale_mlp, cond_gate_mlp = self.norm1(cond_latents, emb=cond_temb)
284
+
285
+ # Attention.
286
+ attention_outputs = self.attn(
287
+ hidden_states=norm_hidden_states,
288
+ encoder_hidden_states=norm_encoder_hidden_states,
289
+ image_rotary_emb=image_rotary_emb,
290
+ data_num_per_group=data_num_per_group,
291
+ max_sequence_length=max_sequence_length,
292
+ mix_attention=mix_attention,
293
+ cond_latents=norm_cond_latents if with_cond else None,
294
+ cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
295
+ **joint_attention_kwargs,
296
+ )
297
+
298
+ if len(attention_outputs) == 2:
299
+ attn_output, context_attn_output = attention_outputs
300
+ elif len(attention_outputs) == 3 and with_cond:
301
+ attn_output, context_attn_output, cond_attn_output = attention_outputs
302
+ elif len(attention_outputs) == 3:
303
+ attn_output, context_attn_output, ip_attn_output = attention_outputs
304
+
305
+ # Process attention outputs for the `hidden_states`.
306
+ attn_output = gate_msa.unsqueeze(1) * attn_output
307
+ hidden_states = hidden_states + attn_output
308
+
309
+ norm_hidden_states = self.norm2(hidden_states)
310
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
311
+
312
+ ff_output = self.ff(norm_hidden_states)
313
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
314
+
315
+ hidden_states = hidden_states + ff_output
316
+ if len(attention_outputs) == 3 and not with_cond:
317
+ hidden_states = hidden_states + ip_attn_output
318
+
319
+ if with_cond:
320
+ cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
321
+ cond_latents = cond_latents + cond_attn_output
322
+
323
+ norm_cond_latents = self.norm2(cond_latents)
324
+ norm_cond_latents = norm_cond_latents * (1 + cond_scale_mlp[:, None]) + cond_shift_mlp[:, None]
325
+
326
+ cond_ff_output = self.ff(norm_cond_latents)
327
+ cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
328
+
329
+ cond_latents = cond_latents + cond_ff_output
330
+ # Process attention outputs for the `encoder_hidden_states`.
331
+
332
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
333
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
334
+
335
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
336
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
337
+
338
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
339
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
340
+ if encoder_hidden_states.dtype == torch.float16:
341
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
342
+
343
+ if with_cond:
344
+ return encoder_hidden_states, hidden_states, cond_latents
345
+ else:
346
+ return encoder_hidden_states, hidden_states
347
+
348
+
349
+ class FluxTransformer2DModel(
350
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
351
+ ):
352
+ """
353
+ The Transformer model introduced in Flux.
354
+
355
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
356
+
357
+ Parameters:
358
+ patch_size (`int`): Patch size to turn the input data into small patches.
359
+ in_channels (`int`, *optional*, defaults to 64): The number of channels in the input.
360
+ num_layers (`int`, *optional*, defaults to 19): The number of layers of MMDiT blocks to use.
361
+ num_single_layers (`int`, *optional*, defaults to 38): The number of layers of single DiT blocks to use.
362
+ attention_head_dim (`int`, *optional*, defaults to 128): The number of channels in each head.
363
+ num_attention_heads (`int`, *optional*, defaults to 24): The number of heads to use for multi-head attention.
364
+ joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
365
+ pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
366
+ guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
367
+ """
368
+
369
+ _supports_gradient_checkpointing = True
370
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
371
+
372
+ @register_to_config
373
+ def __init__(
374
+ self,
375
+ patch_size: int = 1,
376
+ in_channels: int = 64,
377
+ out_channels: Optional[int] = None,
378
+ num_layers: int = 19,
379
+ num_single_layers: int = 38,
380
+ attention_head_dim: int = 128,
381
+ num_attention_heads: int = 24,
382
+ joint_attention_dim: int = 4096,
383
+ pooled_projection_dim: int = 768,
384
+ guidance_embeds: bool = False,
385
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
386
+ ):
387
+ super().__init__()
388
+ self.out_channels = out_channels or in_channels
389
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
390
+ if getattr(self.config, "num_image_tag_embeddings", None) is not None:
391
+ self.image_tag_embeddings = nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim)
392
+ if getattr(self.config, "num_context_tag_embeddings", None) is not None:
393
+ self.context_tag_embeddings = nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim)
394
+
395
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
396
+
397
+ text_time_guidance_cls = (
398
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
399
+ )
400
+ self.time_text_embed = text_time_guidance_cls(
401
+ embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
402
+ )
403
+
404
+ self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
405
+ self.x_embedder = nn.Linear(self.config.in_channels, self.inner_dim)
406
+
407
+ self.transformer_blocks = nn.ModuleList(
408
+ [
409
+ FluxTransformerBlock(
410
+ dim=self.inner_dim,
411
+ num_attention_heads=self.config.num_attention_heads,
412
+ attention_head_dim=self.config.attention_head_dim,
413
+ )
414
+ for i in range(self.config.num_layers)
415
+ ]
416
+ )
417
+
418
+ self.single_transformer_blocks = nn.ModuleList(
419
+ [
420
+ FluxSingleTransformerBlock(
421
+ dim=self.inner_dim,
422
+ num_attention_heads=self.config.num_attention_heads,
423
+ attention_head_dim=self.config.attention_head_dim,
424
+ )
425
+ for i in range(self.config.num_single_layers)
426
+ ]
427
+ )
428
+
429
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
430
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
431
+
432
+ self.gradient_checkpointing = False
433
+
434
+ def set_tag_embeddings(self, num_image_tag_embeddings=0, num_context_tag_embeddings=0):
435
+ if num_image_tag_embeddings > 0:
436
+ self.config.num_image_tag_embeddings = num_image_tag_embeddings
437
+ self.image_tag_embeddings = zero_module(nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim))
438
+ if num_context_tag_embeddings > 0:
439
+ self.config.num_context_tag_embeddings = num_context_tag_embeddings
440
+ self.context_tag_embeddings = zero_module(nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim))
441
+
442
+ def set_mask_tokenizer(self, mask_in_chans, mask_out_chans, activation = nn.GELU):
443
+ self.mask_tokenizer = nn.Sequential(
444
+ nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
445
+ LayerNorm2d(mask_in_chans // 4),
446
+ activation(),
447
+ nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=3, padding=1),
448
+ LayerNorm2d(mask_in_chans),
449
+ activation(),
450
+ nn.Conv2d(mask_in_chans, mask_out_chans, kernel_size=1),
451
+ nn.AdaptiveAvgPool2d((16, 16))
452
+ )
453
+
454
+ self.mask_attn = CrossAttention(mask_out_chans, mask_out_chans)
455
+
456
+ def forward_mask_attn(self, mask_images, fg_images):
457
+ mask_images = self.mask_tokenizer(mask_images)
458
+ mask_images = mask_images.flatten(2).transpose(1, 2)
459
+ mask_images = self.mask_attn(mask_images, fg_images, attention_mask=None)
460
+ return mask_images
461
+
462
+ @property
463
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
464
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
465
+ r"""
466
+ Returns:
467
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
468
+ indexed by its weight name.
469
+ """
470
+ # set recursively
471
+ processors = {}
472
+
473
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
474
+ if hasattr(module, "get_processor"):
475
+ processors[f"{name}.processor"] = module.get_processor()
476
+
477
+ for sub_name, child in module.named_children():
478
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
479
+
480
+ return processors
481
+
482
+ for name, module in self.named_children():
483
+ fn_recursive_add_processors(name, module, processors)
484
+
485
+ return processors
486
+
487
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
488
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
489
+ r"""
490
+ Sets the attention processor to use to compute attention.
491
+
492
+ Parameters:
493
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
494
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
495
+ for **all** `Attention` layers.
496
+
497
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
498
+ processor. This is strongly recommended when setting trainable attention processors.
499
+
500
+ """
501
+ count = len(self.attn_processors.keys())
502
+
503
+ if isinstance(processor, dict) and len(processor) != count:
504
+ raise ValueError(
505
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
506
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
507
+ )
508
+
509
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
510
+ if hasattr(module, "set_processor"):
511
+ if not isinstance(processor, dict):
512
+ module.set_processor(processor)
513
+ else:
514
+ module.set_processor(processor.pop(f"{name}.processor"))
515
+
516
+ for sub_name, child in module.named_children():
517
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
518
+
519
+ for name, module in self.named_children():
520
+ fn_recursive_attn_processor(name, module, processor)
521
+
522
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
523
+ def fuse_qkv_projections(self):
524
+ """
525
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
526
+ are fused. For cross-attention modules, key and value projection matrices are fused.
527
+
528
+ <Tip warning={true}>
529
+
530
+ This API is 🧪 experimental.
531
+
532
+ </Tip>
533
+ """
534
+ self.original_attn_processors = None
535
+
536
+ for _, attn_processor in self.attn_processors.items():
537
+ if "Added" in str(attn_processor.__class__.__name__):
538
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
539
+
540
+ self.original_attn_processors = self.attn_processors
541
+
542
+ for module in self.modules():
543
+ if isinstance(module, Attention):
544
+ module.fuse_projections(fuse=True)
545
+
546
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
547
+
548
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
549
+ def unfuse_qkv_projections(self):
550
+ """Disables the fused QKV projection if enabled.
551
+
552
+ <Tip warning={true}>
553
+
554
+ This API is 🧪 experimental.
555
+
556
+ </Tip>
557
+
558
+ """
559
+ if self.original_attn_processors is not None:
560
+ self.set_attn_processor(self.original_attn_processors)
561
+
562
+ def _set_gradient_checkpointing(self, module, value=False):
563
+ if hasattr(module, "gradient_checkpointing"):
564
+ module.gradient_checkpointing = value
565
+
566
+ def _format_input(self):
567
+ pass
568
+
569
+ def _format_output(self):
570
+ pass
571
+
572
+ def forward(
573
+ self,
574
+ hidden_states: torch.Tensor,
575
+ encoder_hidden_states: torch.Tensor = None,
576
+ cond_input: dict = None,
577
+ pooled_projections: torch.Tensor = None,
578
+ timestep: torch.LongTensor = None,
579
+ img_ids: torch.Tensor = None,
580
+ txt_ids: torch.Tensor = None,
581
+ guidance: torch.Tensor = None,
582
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
583
+ controlnet_block_samples=None,
584
+ controlnet_single_block_samples=None,
585
+ return_dict: bool = True,
586
+ controlnet_blocks_repeat: bool = False,
587
+ data_num_per_group: int = 1,
588
+ image_tags=None,
589
+ context_tags=None,
590
+ max_sequence_length: int = 512,
591
+ mix_attention_double=True,
592
+ mix_attention_single=True,
593
+ ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
594
+ """
595
+ The [`FluxTransformer2DModel`] forward method.
596
+
597
+ Args:
598
+ hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
599
+ Input `hidden_states`.
600
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
601
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
602
+ pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
603
+ from the embeddings of input conditions.
604
+ timestep ( `torch.LongTensor`):
605
+ Used to indicate denoising step.
606
+ block_controlnet_hidden_states: (`list` of `torch.Tensor`):
607
+ A list of tensors that if specified are added to the residuals of transformer blocks.
608
+ joint_attention_kwargs (`dict`, *optional*):
609
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
610
+ `self.processor` in
611
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
612
+ return_dict (`bool`, *optional*, defaults to `True`):
613
+ Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
614
+ tuple.
615
+
616
+ Returns:
617
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
618
+ `tuple` where the first element is the sample tensor.
619
+ """
620
+ if joint_attention_kwargs is not None:
621
+ joint_attention_kwargs = joint_attention_kwargs.copy()
622
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
623
+ else:
624
+ lora_scale = 1.0
625
+
626
+ if USE_PEFT_BACKEND:
627
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
628
+ scale_lora_layers(self, lora_scale)
629
+ else:
630
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
631
+ logger.warning(
632
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
633
+ )
634
+
635
+ hidden_states = self.x_embedder(hidden_states)
636
+
637
+ mask_cond = None
638
+ mask_ids = None
639
+ if cond_input is not None:
640
+ cond_image_latents = cond_input["image_latents"]
641
+ cond_image_ids = cond_input["image_ids"]
642
+ cond_latents = self.x_embedder(cond_image_latents)
643
+
644
+ if joint_attention_kwargs is not None and "mask_cond" in joint_attention_kwargs:
645
+ mask_cond = joint_attention_kwargs.pop("mask_cond")
646
+ mask_ids = joint_attention_kwargs.pop("mask_ids")
647
+ if mask_cond is not None:
648
+ mask_cond = self.forward_mask_attn(mask_cond, cond_latents[:1])
649
+ # joint_attention_kwargs["mask_cond"] = mask_cond
650
+ # hidden_states = hidden_states + mask_cond
651
+
652
+ if image_tags is not None:
653
+ image_tag_embeddings = self.image_tag_embeddings(
654
+ torch.Tensor(
655
+ image_tags,
656
+ ).to(device=hidden_states.device, dtype=torch.int64)
657
+ )
658
+ bsz = hidden_states.shape[0] // data_num_per_group
659
+ image_tag_embeddings = image_tag_embeddings.repeat_interleave(bsz, dim=0)
660
+ if cond_input is not None:
661
+ hidden_states = hidden_states + image_tag_embeddings[0]
662
+ cond_latents = cond_latents + image_tag_embeddings[1:].unsqueeze(1)
663
+ else:
664
+ # for debug
665
+ if len(hidden_states) != len(image_tag_embeddings):
666
+ hidden_states += image_tag_embeddings[:1].unsqueeze(1)
667
+ else:
668
+ hidden_states = hidden_states + image_tag_embeddings.unsqueeze(1)
669
+
670
+ timestep = timestep.to(hidden_states.dtype) * 1000
671
+ if guidance is not None:
672
+ guidance = guidance.to(hidden_states.dtype) * 1000
673
+ else:
674
+ guidance = None
675
+
676
+ temb = (
677
+ self.time_text_embed(timestep, pooled_projections)
678
+ if guidance is None
679
+ else self.time_text_embed(timestep, guidance, pooled_projections)
680
+ )
681
+ if cond_input is not None:
682
+ cond_time = 0
683
+ cond_temb = ( self.time_text_embed(torch.ones_like(timestep)*cond_time, pooled_projections)
684
+ if guidance is None
685
+ else self.time_text_embed(torch.ones_like(timestep)*cond_time, guidance, pooled_projections)
686
+ )
687
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
688
+
689
+ if context_tags is not None:
690
+ context_tag_embeddings = self.context_tag_embeddings(
691
+ torch.Tensor(
692
+ image_tags,
693
+ ).to(device=hidden_states.device, dtype=torch.int64)
694
+ )
695
+ bsz = hidden_states.shape[0] // data_num_per_group
696
+ context_tag_embeddings = context_tag_embeddings.repeat_interleave(bsz, dim=0)
697
+ if cond_input is not None:
698
+ encoder_hidden_states = encoder_hidden_states + context_tag_embeddings[0]
699
+ else:
700
+ if len(encoder_hidden_states) != len(context_tag_embeddings):
701
+ encoder_hidden_states += context_tag_embeddings[:1].unsqueeze(1)
702
+ else:
703
+ encoder_hidden_states = encoder_hidden_states + context_tag_embeddings.unsqueeze(1)
704
+
705
+ if mask_cond is not None:
706
+ encoder_hidden_states = torch.cat([encoder_hidden_states, mask_cond], dim=1) # todo: compare with add
707
+ max_sequence_length = encoder_hidden_states.shape[1]
708
+
709
+ txt_ids = torch.cat((txt_ids, mask_ids), dim=0)
710
+
711
+ if isinstance(img_ids, list):
712
+ image_rotary_emb = []
713
+ for img_ids_ in img_ids:
714
+ ids = torch.cat((txt_ids, img_ids_), dim=0)
715
+ image_rotary_emb.append(self.pos_embed(ids))
716
+ image_rotary_emb = ( # to batch, cos / sin
717
+ torch.stack([_[0] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
718
+ torch.stack([_[1] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
719
+ )
720
+ else:
721
+ ids = torch.cat((txt_ids, img_ids), dim=0)
722
+ image_rotary_emb = self.pos_embed(ids)
723
+ if cond_input is not None:
724
+ cond_rotary_emb = []
725
+ for image_ids in cond_image_ids:
726
+ cond_rotary_emb.append(self.pos_embed(image_ids))
727
+ cond_rotary_emb = (
728
+ torch.stack([_[0] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
729
+ torch.stack([_[1] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
730
+ )
731
+
732
+ if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
733
+ ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
734
+ ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
735
+ joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
736
+
737
+ for index_block, block in enumerate(self.transformer_blocks):
738
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
739
+
740
+ def create_custom_forward(module, return_dict=None):
741
+ def custom_forward(*inputs):
742
+ if return_dict is not None:
743
+ return module(*inputs, return_dict=return_dict)
744
+ else:
745
+ return module(*inputs)
746
+
747
+ return custom_forward
748
+
749
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
750
+ # ckpt_kwargs.update(joint_attention_kwargs)
751
+ block_output = torch.utils.checkpoint.checkpoint(
752
+ create_custom_forward(block),
753
+ hidden_states,
754
+ encoder_hidden_states,
755
+ temb,
756
+ image_rotary_emb,
757
+ data_num_per_group,
758
+ max_sequence_length,
759
+ mix_attention_double,
760
+ cond_temb if cond_input is not None else None,
761
+ cond_rotary_emb if cond_input is not None else None,
762
+ cond_latents if cond_input is not None else None,
763
+ joint_attention_kwargs,
764
+ **ckpt_kwargs,
765
+ )
766
+ else:
767
+ block_output = block(
768
+ hidden_states=hidden_states,
769
+ encoder_hidden_states=encoder_hidden_states,
770
+ temb=temb,
771
+ image_rotary_emb=image_rotary_emb,
772
+ data_num_per_group=data_num_per_group,
773
+ max_sequence_length=max_sequence_length,
774
+ mix_attention=mix_attention_double,
775
+ cond_temb = cond_temb if cond_input is not None else None,
776
+ cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
777
+ cond_latents = cond_latents if cond_input is not None else None,
778
+ joint_attention_kwargs=joint_attention_kwargs,
779
+ )
780
+
781
+ if cond_input is not None and mix_attention_double:
782
+ encoder_hidden_states, hidden_states, cond_latents = block_output
783
+ else:
784
+ encoder_hidden_states, hidden_states = block_output
785
+
786
+ # controlnet residual
787
+ if controlnet_block_samples is not None:
788
+ interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
789
+ interval_control = int(np.ceil(interval_control))
790
+ # For Xlabs ControlNet.
791
+ if controlnet_blocks_repeat:
792
+ hidden_states = (
793
+ hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
794
+ )
795
+ else:
796
+ hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
797
+
798
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
799
+
800
+ for index_block, block in enumerate(self.single_transformer_blocks):
801
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
802
+
803
+ def create_custom_forward(module, return_dict=None):
804
+ def custom_forward(*inputs):
805
+ if return_dict is not None:
806
+ return module(*inputs, return_dict=return_dict)
807
+ else:
808
+ return module(*inputs)
809
+
810
+ return custom_forward
811
+
812
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
813
+ hidden_states = torch.utils.checkpoint.checkpoint(
814
+ create_custom_forward(block),
815
+ hidden_states,
816
+ temb,
817
+ image_rotary_emb,
818
+ data_num_per_group,
819
+ max_sequence_length,
820
+ mix_attention_single,
821
+ cond_temb if cond_input is not None else None,
822
+ cond_rotary_emb if cond_input is not None else None,
823
+ cond_latents if cond_input is not None else None,
824
+ joint_attention_kwargs,
825
+ **ckpt_kwargs,
826
+ )
827
+
828
+ else:
829
+ hidden_states = block(
830
+ hidden_states=hidden_states,
831
+ temb=temb,
832
+ image_rotary_emb=image_rotary_emb,
833
+ data_num_per_group=data_num_per_group,
834
+ max_sequence_length=max_sequence_length,
835
+ mix_attention=mix_attention_single,
836
+ cond_temb = cond_temb if cond_input is not None else None,
837
+ cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
838
+ cond_latents = cond_latents if cond_input is not None else None,
839
+ joint_attention_kwargs=joint_attention_kwargs,
840
+ )
841
+
842
+ if cond_input is not None and mix_attention_single:
843
+ hidden_states, cond_latents = hidden_states
844
+
845
+ # controlnet residual
846
+ if controlnet_single_block_samples is not None:
847
+ interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
848
+ interval_control = int(np.ceil(interval_control))
849
+ hidden_states[:, encoder_hidden_states.shape[1]:, ...] = (
850
+ hidden_states[:, encoder_hidden_states.shape[1]:, ...]
851
+ + controlnet_single_block_samples[index_block // interval_control]
852
+ )
853
+
854
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1]:, ...]
855
+
856
+ hidden_states = self.norm_out(hidden_states, temb)
857
+ output = self.proj_out(hidden_states)
858
+
859
+ if USE_PEFT_BACKEND:
860
+ # remove `lora_scale` from each PEFT layer
861
+ unscale_lora_layers(self, lora_scale)
862
+
863
+ if not return_dict:
864
+ return (output,)
865
+
866
+ return Transformer2DModelOutput(sample=output)
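For orientation (illustrative, not part of the commit): the mask branch above contributes a fixed 16×16 grid of extra context tokens. mask_tokenizer pools the mask to 16×16 with mask_out_chans channels, forward_mask_attn flattens that grid into 256 tokens (after cross-attending them with the foreground latents), and forward() concatenates the result onto encoder_hidden_states, with matching ids appended to txt_ids. A minimal shape sketch, assuming mask_out_chans matches the 3072-dim transformer hidden size and a 512-token prompt embedding:

import torch

# Stand-in for self.mask_tokenizer(mask_images): the conv stack ends in
# AdaptiveAvgPool2d((16, 16)) with mask_out_chans channels.
mask_feat = torch.randn(1, 3072, 16, 16)              # (B, mask_out_chans, 16, 16)
mask_tokens = mask_feat.flatten(2).transpose(1, 2)    # (B, 256, 3072): 16*16 mask tokens
text_tokens = torch.randn(1, 512, 3072)               # stand-in encoder_hidden_states
joint_context = torch.cat([text_tokens, mask_tokens], dim=1)
print(joint_context.shape)                            # torch.Size([1, 768, 3072])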
dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc ADDED
Binary file (8.68 kB). View file
 
dreamfuse/trains/utils/inference_utils.py ADDED
@@ -0,0 +1,386 @@
1
+ import torch
2
+ from diffusers.utils.torch_utils import randn_tensor
3
+ import numpy as np
4
+ from einops import rearrange
5
+ import torch.nn.functional as F
6
+
7
+ def get_mask_affine(mask1, mask2):
8
+ box1 = mask1.getbbox()
9
+ box2 = mask2.getbbox()
10
+
11
+ if box1 is None or box2 is None:
12
+ affine_coeffs = [1, 0, 0, 0, 1, 0]
13
+ return affine_coeffs
14
+
15
+ left1, top1, right1, bottom1 = box1
16
+ left2, top2, right2, bottom2 = box2
17
+
18
+ w1, h1 = right1 - left1, bottom1 - top1
19
+ w2, h2 = right2 - left2, bottom2 - top2
20
+
21
+ scale_x = w1 / w2
22
+ scale_y = h1 / h2
23
+
24
+ tx = left1 - left2*scale_x
25
+ ty = top1 - top2*scale_y
26
+
27
+ affine_coeffs = [scale_x, 0, tx, 0, scale_y, ty]
28
+ return affine_coeffs
29
+
30
+ def tokenize_prompt(tokenizer, prompt, max_sequence_length):
31
+ text_inputs = tokenizer(
32
+ prompt,
33
+ padding="max_length",
34
+ max_length=max_sequence_length,
35
+ truncation=True,
36
+ return_length=False,
37
+ return_overflowing_tokens=False,
38
+ return_tensors="pt",
39
+ )
40
+ text_input_ids = text_inputs.input_ids
41
+ return text_input_ids
42
+
43
+
44
+ def _encode_prompt_with_t5(
45
+ text_encoder,
46
+ tokenizer,
47
+ max_sequence_length=512,
48
+ prompt=None,
49
+ num_images_per_prompt=1,
50
+ device=None,
51
+ text_input_ids=None,
52
+ ):
53
+ prompt = [prompt] if isinstance(prompt, str) else prompt
54
+ batch_size = len(prompt)
55
+
56
+ if tokenizer is not None:
57
+ text_inputs = tokenizer(
58
+ prompt,
59
+ padding="max_length",
60
+ max_length=max_sequence_length,
61
+ truncation=True,
62
+ return_length=False,
63
+ return_overflowing_tokens=False,
64
+ return_tensors="pt",
65
+ )
66
+ text_input_ids = text_inputs.input_ids
67
+ else:
68
+ if text_input_ids is None:
69
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
70
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
71
+
72
+ dtype = text_encoder.dtype
73
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
74
+
75
+ _, seq_len, _ = prompt_embeds.shape
76
+
77
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
78
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
79
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
80
+
81
+ return prompt_embeds
82
+
83
+
84
+ def _encode_prompt_with_clip(
85
+ text_encoder,
86
+ tokenizer,
87
+ prompt: str,
88
+ device=None,
89
+ text_input_ids=None,
90
+ num_images_per_prompt: int = 1,
91
+ ):
92
+ prompt = [prompt] if isinstance(prompt, str) else prompt
93
+ batch_size = len(prompt)
94
+
95
+ if tokenizer is not None:
96
+ text_inputs = tokenizer(
97
+ prompt,
98
+ padding="max_length",
99
+ max_length=77,
100
+ truncation=True,
101
+ return_overflowing_tokens=False,
102
+ return_length=False,
103
+ return_tensors="pt",
104
+ )
105
+
106
+ text_input_ids = text_inputs.input_ids
107
+ else:
108
+ if text_input_ids is None:
109
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
110
+
111
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
112
+
113
+ # Use pooled output of CLIPTextModel
114
+ prompt_embeds = prompt_embeds.pooler_output
115
+ prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
116
+
117
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
118
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
119
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
120
+
121
+ return prompt_embeds
122
+
123
+
124
+ def compute_text_embeddings(config, prompt, text_encoders, tokenizers, device):
125
+ with torch.no_grad():
126
+ prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
127
+ text_encoders, tokenizers, prompt, config.max_sequence_length
128
+ )
129
+ prompt_embeds = prompt_embeds.to(device)
130
+ pooled_prompt_embeds = pooled_prompt_embeds.to(device)
131
+ text_ids = text_ids.to(device)
132
+ return prompt_embeds, pooled_prompt_embeds, text_ids
133
+
134
+
135
+ def _prepare_image_ids(height, width, offset_h=0, offset_w=0):
136
+ image_ids = torch.zeros(height, width, 3)
137
+ image_ids[..., 1] = image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
138
+ image_ids[..., 2] = image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
139
+ image_ids = image_ids.reshape(-1, 3)
140
+ return image_ids
141
+
142
+
143
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
144
+ latents = latents.view(
145
+ batch_size, num_channels_latents, height // 2, 2, width // 2, 2
146
+ )
147
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
148
+ latents = latents.reshape(
149
+ batch_size, (height // 2) * (width // 2), num_channels_latents * 4
150
+ )
151
+
152
+ return latents
153
+
154
+ def _unpack_latents(latents, height, width, vae_downsample_factor):
155
+ batch_size, num_patches, channels = latents.shape
156
+
157
+ # VAE applies 8x compression on images but we must also account for packing which requires
158
+ # latent height and width to be divisible by 2.
159
+ height = 2 * (int(height) // (vae_downsample_factor * 2))
160
+ width = 2 * (int(width) // (vae_downsample_factor * 2))
161
+
162
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
163
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
164
+
165
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
166
+
167
+ return latents
168
+
169
+
170
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype, offset_h=0, offset_w=0):
171
+ latent_image_ids = torch.zeros(height, width, 3)
172
+ latent_image_ids[..., 1] = (
173
+ latent_image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
174
+ )
175
+ latent_image_ids[..., 2] = (
176
+ latent_image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
177
+ )
178
+
179
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
180
+ latent_image_ids.shape
181
+ )
182
+
183
+ latent_image_ids = latent_image_ids.reshape(
184
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
185
+ )
186
+
187
+ return latent_image_ids.to(device=device, dtype=dtype)
188
+
189
+
190
+ def pil_to_tensor(image, device="cpu"):
191
+ image = np.array(image)
192
+ image = torch.from_numpy(image).float() / 127.5 - 1.0
193
+ image = image.permute(2, 0, 1).to(device)
194
+ return image
195
+
196
+ @torch.no_grad()
197
+ def encode_images_cond(vae_model, condition_images, device):
198
+ condition_image_tensors = []
199
+ for condition_image in condition_images:
200
+ condition_image_tensor = torch.tensor(np.array(condition_image)).to(device).permute(0, 3, 1, 2) # shape: [n_cond, c, h, w]
201
+ condition_image_tensor = condition_image_tensor / 127.5 - 1.0
202
+ condition_image_tensors.append(condition_image_tensor)
203
+ condition_image_tensors = torch.stack(condition_image_tensors) # shape: [bs, n_cond, c, h, w]
204
+ condition_image_tensors = rearrange(condition_image_tensors, 'b n c h w -> (b n) c h w')
205
+
206
+ # encode condition images
207
+ condition_image_latents = (
208
+ vae_model.encode(
209
+ condition_image_tensors.to(vae_model.dtype)
210
+ ).latent_dist.sample()
211
+ ) # shape: [bs*n_cond, c, h // 8, w // 8]
212
+ condition_image_latents = (condition_image_latents - vae_model.config.shift_factor) * vae_model.config.scaling_factor
213
+
214
+ return condition_image_latents
215
+
216
+
217
+ def prepare_latents(
218
+ batch_size,
219
+ num_channels_latents,
220
+ vae_downsample_factor,
221
+ height,
222
+ width,
223
+ dtype,
224
+ device,
225
+ generator,
226
+ latents=None,
227
+ offset=None,
228
+ hw=False,
229
+ ):
230
+ # VAE applies 8x compression on images but we must also account for packing which requires
231
+ # latent height and width to be divisible by 2.
232
+ height = 2 * (int(height) // (vae_downsample_factor * 2))
233
+ width = 2 * (int(width) // (vae_downsample_factor * 2))
234
+
235
+ shape = (batch_size, num_channels_latents, height, width)
236
+
237
+ if latents is not None:
238
+ if offset is None:
239
+ latent_image_ids = _prepare_latent_image_ids(
240
+ batch_size, height // 2, width // 2, device, dtype
241
+ )
242
+ else:
243
+ latent_image_ids = []
244
+ for offset_ in offset:
245
+ latent_image_ids.append(
246
+ _prepare_latent_image_ids(
247
+ batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
248
+ )
249
+ )
250
+ return latents.to(device=device, dtype=dtype), latent_image_ids
251
+
252
+ if isinstance(generator, list) and len(generator) != batch_size:
253
+ raise ValueError(
254
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
255
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
256
+ )
257
+
258
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
259
+ latents = _pack_latents(
260
+ latents, batch_size, num_channels_latents, height, width
261
+ )
262
+ if offset is None:
263
+ latent_image_ids = _prepare_latent_image_ids(
264
+ batch_size, height // 2, width // 2, device, dtype
265
+ )
266
+ else:
267
+ latent_image_ids = []
268
+ for offset_ in offset:
269
+ latent_image_ids.append(
270
+ _prepare_latent_image_ids(
271
+ batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
272
+ )
273
+ )
274
+ return latents, latent_image_ids
275
+
276
+
277
+ @torch.no_grad()
278
+ def encode_prompt(
279
+ text_encoders,
280
+ tokenizers,
281
+ prompt: str,
282
+ max_sequence_length,
283
+ device=None,
284
+ num_images_per_prompt: int = 1,
285
+ text_input_ids_list=None,
286
+ ):
287
+ prompt = [prompt] if isinstance(prompt, str) else prompt
288
+ dtype = text_encoders[0].dtype
289
+
290
+ pooled_prompt_embeds = _encode_prompt_with_clip(
291
+ text_encoder=text_encoders[0],
292
+ tokenizer=tokenizers[0],
293
+ prompt=prompt,
294
+ device=device if device is not None else text_encoders[0].device,
295
+ num_images_per_prompt=num_images_per_prompt,
296
+ text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
297
+ )
298
+
299
+ prompt_embeds = _encode_prompt_with_t5(
300
+ text_encoder=text_encoders[1],
301
+ tokenizer=tokenizers[1],
302
+ max_sequence_length=max_sequence_length,
303
+ prompt=prompt,
304
+ num_images_per_prompt=num_images_per_prompt,
305
+ device=device if device is not None else text_encoders[1].device,
306
+ text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
307
+ )
308
+
309
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
310
+
311
+ return prompt_embeds, pooled_prompt_embeds, text_ids
312
+
313
+ def warp_affine_tensor(input_tensor, mask_affines, output_size, scale_factor=1/16,
314
+ align_corners_grid=False, align_corners_sample=True,
315
+ flatten_output=True, device=None):
316
+ """
317
+ Apply an affine transform to the input tensor and return the warped result.
318
+
319
+ Args:
320
+ input_tensor: tensor to warp; accepted shapes are (H, W, C), (C, H, W) or (1, C, H, W).
321
+ mask_affines: affine coefficients (e.g. [a, 0, tₓ, 0, e, t_y]); the translation terms are expressed on a 512×512 image.
322
+ output_size: target spatial size, given as (H_out, W_out).
323
+ scale_factor: scaling applied to the translation terms; e.g. for 512→32 the factor is 32/512 = 1/16.
324
+ align_corners_grid: align_corners argument passed to F.affine_grid.
325
+ align_corners_sample: align_corners argument passed to F.grid_sample.
326
+ flatten_output: if True, reshape the warped tensor from (1, C, H_out, W_out) to (-1, C).
327
+ device: if set, move the relevant tensors to this device.
328
+
329
+ Returns:
330
+ warped_output: the affine-warped tensor,
331
+ of shape (H_out*W_out, C) if flatten_output is True, otherwise (1, C, H_out, W_out).
332
+ """
333
+ # If the input tensor is not batched (4D), reshape it to (1, C, H, W)
334
+ if input_tensor.dim() == 3:
335
+ # A trailing dimension of 3 is treated as (H, W, C), i.e. an RGB layout
336
+ if input_tensor.shape[-1] == 3:
337
+ input_tensor = input_tensor.permute(2, 0, 1)
338
+ input_tensor = input_tensor.unsqueeze(0)
339
+ elif input_tensor.dim() != 4:
340
+ raise ValueError("input_tensor must be a 3D or 4D tensor!")
341
+
342
+ # Target output size
343
+ H_out, W_out = output_size
344
+ B, C, H_in, W_in = input_tensor.shape
345
+
346
+ # Convert mask_affines to a tensor of shape (1, 6)
347
+ if not torch.is_tensor(mask_affines):
348
+ theta = torch.tensor(mask_affines, dtype=torch.float32).unsqueeze(0)
349
+ else:
350
+ theta = mask_affines.clone().float()
351
+ if theta.dim() == 1:
352
+ theta = theta.unsqueeze(0)
353
+
354
+ # Rescale the translation terms (3rd and 6th coefficients) to the target resolution
355
+ theta[0, 2] *= scale_factor # translation along x
356
+ theta[0, 5] *= scale_factor # translation along y
357
+
358
+ a = theta[0, 0]
359
+ t_x = theta[0, 2]
360
+ e = theta[0, 4]
361
+ t_y = theta[0, 5]
362
+
363
+ # Convert to the normalized [-1, 1] coordinates used by affine_grid
364
+ # Along x the normalization is x_norm = 2*x/(W_out-1) - 1,
365
+ # so the constant term of the affine becomes a + 2*t_x/(W_out-1) - 1
366
+ theta_norm = torch.tensor([
367
+ [a, 0.0, a + 2*t_x/(W_out - 1) - 1],
368
+ [0.0, e, e + 2*t_y/(H_out - 1) - 1]
369
+ ], dtype=torch.float32).unsqueeze(0)
370
+
371
+ # Build the affine_grid for the target output size; the grid size is (B, C, H_out, W_out)
372
+ grid = F.affine_grid(theta_norm, size=(B, C, H_out, W_out), align_corners=align_corners_grid)
373
+ if device is not None:
374
+ grid = grid.to(device)
375
+ input_tensor = input_tensor.to(device)
376
+
377
+ # Sample the input tensor with the affine grid
378
+ warped = F.grid_sample(input_tensor, grid, align_corners=align_corners_sample)
379
+
380
+ # Optionally flatten the output to (-1, C)
381
+ if flatten_output:
382
+ # (1, C, H_out, W_out) → (H_out, W_out, C) → reshape(-1, C)
383
+ warped = warped.squeeze(0).permute(1, 2, 0).reshape(-1, C)
384
+ return warped
385
+
386
+
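A small, illustrative sanity check for the helpers above (not part of the commit): _pack_latents turns a (B, C, H, W) latent into (B, (H/2)·(W/2), 4C) patch tokens, and _unpack_latents inverts it exactly when given the pixel-space size and the VAE downsample factor. A minimal sketch for a 512×512 image with a 16-channel, 8× downsampled latent:

import torch
from dreamfuse.trains.utils.inference_utils import _pack_latents, _unpack_latents

latents = torch.randn(1, 16, 64, 64)              # (B, C, H/8, W/8) for a 512x512 image
packed = _pack_latents(latents, 1, 16, 64, 64)    # -> (1, (64//2)*(64//2), 16*4) = (1, 1024, 64)
restored = _unpack_latents(packed, 512, 512, 8)   # pixel height/width + vae_downsample_factor
assert restored.shape == latents.shape
assert torch.equal(restored, latents)             # packing is exactly invertible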
dreamfuse_inference.py ADDED
@@ -0,0 +1,642 @@
1
+ import gc
2
+ import os
3
+ from typing import List
4
+ import contextlib
5
+ import torch.multiprocessing as mp
6
+ from dataclasses import dataclass, field
7
+ from collections import defaultdict
8
+ import random
9
+ import numpy as np
10
+ from PIL import Image, ImageOps
11
+ import json
12
+ import torch
13
+ from peft import PeftModel
14
+ import torch.nn.functional as F
15
+ import accelerate
16
+ import diffusers
17
+ from diffusers import FluxPipeline
18
+ from diffusers.utils.torch_utils import is_compiled_module
19
+ import transformers
20
+ from tqdm import tqdm
21
+ from peft import LoraConfig, set_peft_model_state_dict
22
+ from peft.utils import get_peft_model_state_dict
23
+ from dreamfuse.models.dreamfuse_flux.transformer import (
24
+ FluxTransformer2DModel,
25
+ FluxTransformerBlock,
26
+ FluxSingleTransformerBlock,
27
+ )
28
+ from diffusers.schedulers.scheduling_flow_match_euler_discrete import (
29
+ FlowMatchEulerDiscreteScheduler,
30
+ )
31
+ from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps
32
+ from dreamfuse.trains.utils.inference_utils import (
33
+ compute_text_embeddings,
34
+ prepare_latents,
35
+ _unpack_latents,
36
+ _pack_latents,
37
+ _prepare_image_ids,
38
+ encode_images_cond,
39
+ get_mask_affine,
40
+ warp_affine_tensor
41
+ )
42
+
43
+
44
+ def seed_everything(seed):
45
+ torch.manual_seed(seed)
46
+ torch.cuda.manual_seed(seed)
47
+ random.seed(seed)
48
+ np.random.seed(seed)
49
+
50
+ @dataclass
51
+ class InferenceConfig:
52
+ # Model paths
53
+ flux_model_id: str = 'black-forest-labs/FLUX.1-dev'
54
+
55
+ lora_id: str = ''
56
+ model_choice: str = 'dev'
57
+ # Model configs
58
+ lora_rank: int = 16
59
+ max_sequence_length: int = 256
60
+ guidance_scale: float = 3.5
61
+ num_inference_steps: int = 28
62
+ mask_ids: int = 16
63
+ mask_in_chans: int = 128
64
+ mask_out_chans: int = 3072
65
+ inference_scale = 1024
66
+
67
+ # Training configs
68
+ gradient_checkpointing: bool = False
69
+ mix_attention_double: bool = True
70
+ mix_attention_single: bool = True
71
+
72
+ # Image processing
73
+ image_ids_offset: List[int] = field(default_factory=lambda: [0, 0, 0])
74
+ image_tags: List[int] = field(default_factory=lambda: [0, 1, 2])
75
+ context_tags: List[int] = None
76
+
77
+ # Runtime configs
78
+ device: str = "cuda:0" # if torch.cuda.is_available() else "cpu"
79
+ dtype: torch.dtype = torch.bfloat16
80
+ seed: int = 1234
81
+ debug: bool = True
82
+
83
+ # I/O configs
84
+ valid_output_dir: str = "./inference_output"
85
+ valid_roots: List[str] = field(default_factory=lambda: [
86
+ "./",
87
+ ])
88
+ valid_jsons: List[str] = field(default_factory=lambda: [
89
+ "./examples/data_dreamfuse.json",
90
+ ])
91
+ ref_prompts: str = ""
92
+
93
+ truecfg: bool = False
94
+ text_strength: int = 5
95
+
96
+ # multi gpu
97
+ sub_idx:int = 0
98
+ total_num:int = 1
99
+
100
+ def adjust_fg_to_bg(image: Image.Image, mask: Image.Image, target_size: tuple) -> tuple[Image.Image, Image.Image]:
101
+ width, height = image.size
102
+ target_w, target_h = target_size
103
+
104
+ scale = min(target_w / width, target_h / height)
105
+ if scale < 1:
106
+ new_w = int(width * scale)
107
+ new_h = int(height * scale)
108
+ image = image.resize((new_w, new_h))
109
+ mask = mask.resize((new_w, new_h))
110
+ width, height = new_w, new_h
111
+
112
+ pad_w = target_w - width
113
+ pad_h = target_h - height
114
+ padding = (
115
+ pad_w // 2, # left
116
+ pad_h // 2, # top
117
+ (pad_w + 1) // 2, # right
118
+ (pad_h + 1) // 2 # bottom
119
+ )
120
+
121
+ image = ImageOps.expand(image, border=padding, fill=(255, 255, 255))
122
+ mask = ImageOps.expand(mask, border=padding, fill=0)
123
+
124
+ return image, mask
125
+
126
+ def find_nearest_bucket_size(input_width, input_height, mode="x64", bucket_size=1024):
127
+ """
128
+ Finds the nearest bucket size for the given input size.
129
+ """
130
+ buckets = {
131
+ 512: [[ 256, 768 ], [ 320, 768 ], [ 320, 704 ], [ 384, 640 ], [ 448, 576 ], [ 512, 512 ], [ 576, 448 ], [ 640, 384 ], [ 704, 320 ], [ 768, 320 ], [ 768, 256 ]],
132
+ 768: [[ 384, 1152 ], [ 480, 1152 ], [ 480, 1056 ], [ 576, 960 ], [ 672, 864 ], [ 768, 768 ], [ 864, 672 ], [ 960, 576 ], [ 1056, 480 ], [ 1152, 480 ], [ 1152, 384 ]],
133
+ 1024: [[ 512, 1536 ], [ 640, 1536 ], [ 640, 1408 ], [ 768, 1280 ], [ 896, 1152 ], [ 1024, 1024 ], [ 1152, 896 ], [ 1280, 768 ], [ 1408, 640 ], [ 1536, 640 ], [ 1536, 512 ]]
134
+ }
135
+
136
+ buckets = buckets[bucket_size]
137
+
138
+ aspect_ratios = [w / h for (w, h) in buckets]
139
+ assert mode in ["x64", "x8"]
140
+ if mode == "x64":
141
+ asp = input_width / input_height
142
+ diff = [abs(ar - asp) for ar in aspect_ratios]
143
+ bucket_id = int(np.argmin(diff))
144
+ gen_width, gen_height = buckets[bucket_id]
145
+ elif mode == "x8":
146
+ max_pixels = 1024 * 1024
147
+ ratio = (max_pixels / (input_width * input_height)) ** (0.5)
148
+ gen_width, gen_height = round(input_width * ratio), round(input_height * ratio)
149
+ gen_width = gen_width - gen_width % 8
150
+ gen_height = gen_height - gen_height % 8
151
+ else:
152
+ raise NotImplementedError
153
+ return (gen_width, gen_height)
154
+
155
+ def make_image_grid(images, rows, cols, size=None):
156
+ assert len(images) == rows * cols
157
+
158
+ if size is not None:
159
+ images = [img.resize((size[0], size[1])) for img in images]
160
+
161
+ w, h = images[0].size
162
+ grid = Image.new("RGB", size=(cols * w, rows * h))
163
+
164
+ for i, img in enumerate(images):
165
+ grid.paste(img.convert("RGB"), box=(i % cols * w, i // cols * h))
166
+ return grid
167
+
168
+ class DreamFuseInference:
169
+ def __init__(self, config: InferenceConfig):
170
+ self.config = config
171
+ print(config.device)
172
+ self.device = torch.device(config.device)
173
+ torch.backends.cuda.matmul.allow_tf32 = True
174
+ seed_everything(config.seed)
175
+ self._init_models()
176
+
177
+ def _init_models(self):
178
+ # Initialize tokenizers
179
+ self.tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
180
+ self.config.flux_model_id, subfolder="tokenizer"
181
+ )
182
+ self.tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
183
+ self.config.flux_model_id, subfolder="tokenizer_2"
184
+ )
185
+
186
+ # Initialize text encoders
187
+ self.text_encoder_one = transformers.CLIPTextModel.from_pretrained(
188
+ self.config.flux_model_id, subfolder="text_encoder"
189
+ ).to(device=self.device, dtype=self.config.dtype)
190
+ self.text_encoder_two = transformers.T5EncoderModel.from_pretrained(
191
+ self.config.flux_model_id, subfolder="text_encoder_2"
192
+ ).to(device=self.device, dtype=self.config.dtype)
193
+
194
+ # Initialize VAE
195
+ self.vae = diffusers.AutoencoderKL.from_pretrained(
196
+ self.config.flux_model_id, subfolder="vae"
197
+ ).to(device=self.device, dtype=self.config.dtype)
198
+
199
+ # Initialize denoising model
200
+ self.denoise_model = FluxTransformer2DModel.from_pretrained(
201
+ self.config.flux_model_id, subfolder="transformer"
202
+ ).to(device=self.device, dtype=self.config.dtype)
203
+
204
+ if self.config.image_tags is not None or self.config.context_tags is not None:
205
+ num_image_tag_embeddings = max(self.config.image_tags) + 1 if self.config.image_tags is not None else 0
206
+ num_context_tag_embeddings = max(self.config.context_tags) + 1 if self.config.context_tags is not None else 0
207
+ self.denoise_model.set_tag_embeddings(
208
+ num_image_tag_embeddings=num_image_tag_embeddings,
209
+ num_context_tag_embeddings=num_context_tag_embeddings,
210
+ )
211
+
212
+ # Add LoRA
213
+ self.denoise_model = PeftModel.from_pretrained(
214
+ self.denoise_model,
215
+ self.config.lora_id,
216
+ adapter_weights=[1.0],
217
+ device_map={"": self.device}
218
+ )
219
+
220
+ # Initialize scheduler
221
+ self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
222
+ self.config.flux_model_id, subfolder="scheduler"
223
+ )
224
+
225
+ # Set models to eval mode
226
+ for model in [self.text_encoder_one, self.text_encoder_two, self.vae, self.denoise_model]:
227
+ model.eval()
228
+ model.requires_grad_(False)
229
+
230
+ def _compute_text_embeddings(self, prompt):
231
+ return compute_text_embeddings(
232
+ self.config,
233
+ prompt,
234
+ [self.text_encoder_one, self.text_encoder_two],
235
+ [self.tokenizer_one, self.tokenizer_two],
236
+ self.device
237
+ )
238
+
239
+ def resize_to_fit_within(self, reference_image, target_image):
240
+ ref_width, ref_height = reference_image.size
241
+ target_width, target_height = target_image.size
242
+
243
+ scale_width = ref_width / target_width
244
+ scale_height = ref_height / target_height
245
+ scale = min(scale_width, scale_height) # take the smaller ratio so the result stays within the reference image
246
+
247
+ new_width = int(target_width * scale)
248
+ new_height = int(target_height * scale)
249
+
250
+ resized_image = target_image.resize((new_width, new_height), Image.LANCZOS)
251
+ return resized_image
252
+
253
+ def pad_or_crop(self, img, target_size, fill_color=(255, 255, 255)):
254
+ """
255
+ Center-align the input image and crop or pad it to target_size.
256
+
257
+ Args:
258
+ img - PIL.Image object
259
+ target_size - target size as (width, height)
260
+ fill_color - padding color, white by default
261
+
262
+ Returns:
263
+ The adjusted PIL.Image object, sized to target_size
264
+ """
265
+ iw, ih = img.size
266
+ tw, th = target_size
267
+
268
+ # Crop region: if the image is larger than the target, take the center; otherwise keep it whole
269
+ left = (iw - tw) // 2 if iw >= tw else 0
270
+ top = (ih - th) // 2 if ih >= th else 0
271
+ cropped = img.crop((left, top, left + min(iw, tw), top + min(ih, th)))
272
+
273
+ # Create a target-size canvas and paste the cropped image centered
274
+ new_img = Image.new(img.mode, target_size, fill_color)
275
+ offset = ((tw - cropped.width) // 2, (th - cropped.height) // 2)
276
+ new_img.paste(cropped, offset)
277
+
278
+ return new_img
279
+
280
+ def transform_foreground_original(self, original_fg, original_bg, transformation_info, canvas_size=400):
281
+ """
282
+ Translate the original foreground image (original_fg) according to transformation_info,
283
+ subject to the following requirements:
284
+ 1. The output image has the same size as original_fg (the original foreground size is kept);
285
+ 2. For the offset computation, the drag coordinates are first restored to their unscaled values, i.e. drag_left/drag_top divided by scale_ratio;
286
+ 3. The relative drag offset is measured on the 400x400 preview canvas against the default (centered) unscaled position,
287
+ and then mapped to an actual pixel offset at the original foreground resolution.
288
+ 4. The foreground is pasted onto a white canvas of the original foreground size (uncovered areas are filled with white).
289
+
290
+ Args:
291
+ original_fg: the originally uploaded foreground image (PIL Image object)
292
+ transformation_info: dict that must contain the following fields:
293
+ - "drag_left": x coordinate of the displayed foreground's top-left corner after dragging (affected by scaling, in pixels)
294
+ - "drag_top": y coordinate of the displayed foreground's top-left corner after dragging (affected by scaling, in pixels)
295
+ - "scale_ratio": scale ratio of the foreground in the preview
296
+ - "data_original_width": unscaled width of the foreground in the preview
297
+ - "data_original_height": unscaled height of the foreground in the preview
298
+ canvas_size: preview canvas size (default 400, matching the front end)
299
+
300
+ Returns:
301
+ The processed image (PIL Image object), the same size as original_fg,
302
+ translated by the relative drag offset computed in the unscaled preview.
303
+ """
304
+ # Read the parameters from transformation_info
305
+ drag_left = float(transformation_info.get("drag_left", 0))
306
+ drag_top = float(transformation_info.get("drag_top", 0))
307
+ scale_ratio = float(transformation_info.get("scale_ratio", 1))
308
+ data_orig_width = float(transformation_info.get("data_original_width", canvas_size))
309
+ data_orig_height = float(transformation_info.get("data_original_height", canvas_size))
310
+ drag_width = float(transformation_info.get("drag_width", 0))
311
+ drag_height = float(transformation_info.get("drag_height", 0))
312
+
313
+
314
+ scale_ori_fg = canvas_size / max(original_fg.width, original_fg.height)
315
+ scale_ori_bg = canvas_size / max(original_bg.width, original_bg.height)
316
+
317
+ # Default centered position in the (unscaled) preview, i.e. where the foreground sits before any drag
318
+ default_left = (canvas_size - data_orig_width) / 2.0
319
+ default_top = (canvas_size - data_orig_height) / 2.0
320
+
321
+ # Actual drag offset in the unscaled preview (in pixels, measured at preview size)
322
+ offset_preview_x = drag_left - default_left
323
+ offset_preview_y = drag_top - default_top
324
+
325
+ offset_ori_x = offset_preview_x / scale_ori_fg
326
+ offset_ori_y = offset_preview_y / scale_ori_fg
327
+
328
+ new_width = int(original_fg.width * scale_ratio)
329
+ new_height = int(original_fg.height * scale_ratio)
330
+ scale_fg = original_fg.resize((new_width, new_height))
331
+
332
+ output = Image.new("RGBA", (original_fg.width, original_fg.height), (255, 255, 255, 0))
333
+ output.paste(scale_fg, (int(offset_ori_x), int(offset_ori_y)))
334
+
335
+ new_width_fgbg = original_fg.width * scale_ori_fg / scale_ori_bg
336
+ new_height_fgbg = original_fg.height * scale_ori_fg / scale_ori_bg
337
+ scale_fgbg = output.resize((int(new_width_fgbg), int(new_height_fgbg)))
338
+
339
+
340
+ final_output = Image.new("RGBA", (original_bg.width, original_bg.height), (255, 255, 255, 0))
341
+ scale_fgbg = self.pad_or_crop(scale_fgbg, (original_bg.width, original_bg.height), (255, 255, 255, 0))
342
+ final_output.paste(scale_fgbg, (0, 0))
343
+
344
+ fit_fg = self.resize_to_fit_within(original_bg, original_fg)
345
+ fit_fg = self.pad_or_crop(fit_fg, original_bg.size, (255, 255, 255, 0))
346
+
347
+ return final_output, fit_fg
348
+
349
+ @torch.inference_mode()
350
+ def gradio_generate(self, background_img, foreground_img, transformation_info, seed, prompt, enable_gui, cfg=3.5, size_select="1024", text_strength=1, truecfg=False):
351
+ print("!"*10)
352
+ """Run model inference with DreamFuseInference."""
353
+ try:
354
+ trans = json.loads(transformation_info)
355
+ except Exception:
356
+ trans = {}
357
+
358
+ size_select = int(size_select)
359
+
360
+ # import pdb; pdb.set_trace()
361
+ r, g, b, ori_a = foreground_img.split()
362
+ fg_img_scale, fg_img = self.transform_foreground_original(foreground_img, background_img, trans)
363
+
364
+ new_r, new_g, new_b, new_a = fg_img_scale.split()
365
+ foreground_img_scale = Image.merge("RGB", (new_r, new_g, new_b))
366
+
367
+ r, g, b, ori_a = fg_img.split()
368
+ foreground_img = Image.merge("RGB", (r, g, b))
369
+ foreground_img_save = foreground_img.copy()
370
+ ori_a = ori_a.convert("L")
371
+ new_a = new_a.convert("L")
372
+ foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_a))
373
+ print("0"*10)
374
+ print(foreground_img.size)
375
+ print(background_img.size)
376
+ images = self.model_generate(foreground_img.copy(), background_img.copy(),
377
+ ori_a, new_a,
378
+ enable_mask_affine=enable_gui,
379
+ prompt=prompt,
380
+ offset_cond=[0, 1, 0] if not enable_gui else None,
381
+ seed=seed,
382
+ cfg=cfg,
383
+ size_select=size_select,
384
+ text_strength=text_strength,
385
+ truecfg=truecfg)
386
+ images = Image.fromarray(images[0], "RGB")
387
+
388
+ images = images.resize(background_img.size)
389
+ images_save = images.copy()
390
+
391
+ images.thumbnail((640, 640), Image.LANCZOS)
392
+ return images
393
+
394
+
395
+ @torch.inference_mode()
396
+ def model_generate(self, fg_image, bg_image, ori_fg_mask, new_fg_mask, enable_mask_affine=True, prompt="", offset_cond=None, seed=None, cfg=3.5, size_select=1024, text_strength=1, truecfg=False):
397
+ batch_size = 1
398
+ print("-3"*10)
399
+ # Prepare images
400
+ # adjust bg->fg size
401
+ fg_image, ori_fg_mask = adjust_fg_to_bg(fg_image, ori_fg_mask, bg_image.size)
402
+ bucket_size = find_nearest_bucket_size(bg_image.size[0], bg_image.size[1], bucket_size=size_select)
403
+
404
+ fg_image = fg_image.resize(bucket_size)
405
+ bg_image = bg_image.resize(bucket_size)
406
+
407
+ mask_affine = None
408
+ if enable_mask_affine:
409
+ ori_fg_mask = ori_fg_mask.resize(bucket_size)
410
+ new_fg_mask = new_fg_mask.resize(bucket_size)
411
+ mask_affine = get_mask_affine(new_fg_mask, ori_fg_mask)
412
+
413
+ print("-2"*10)
414
+ # Get embeddings
415
+ prompt_embeds, pooled_prompt_embeds, text_ids = self._compute_text_embeddings(prompt)
416
+
417
+ prompt_embeds = prompt_embeds.repeat(1, text_strength, 1)
418
+ text_ids = text_ids.repeat(text_strength, 1)
419
+
420
+ # Prepare
421
+ if self.config.model_choice == "dev":
422
+ guidance = torch.full([1], cfg, device=self.device, dtype=torch.float32)
423
+ guidance = guidance.expand(batch_size)
424
+ else:
425
+ guidance = None
426
+
427
+ # Prepare generator
428
+ if seed is None:
429
+ seed = self.config.seed
430
+ generator = torch.Generator(device=self.device).manual_seed(seed)
431
+ print("-1"*10)
432
+ # Prepare condition latents
433
+ condition_image_latents = self._encode_images([fg_image, bg_image])
434
+
435
+ if offset_cond is None:
436
+ offset_cond = self.config.image_ids_offset
437
+ offset_cond = offset_cond[1:]
438
+ cond_latent_image_ids = []
439
+ for offset_ in offset_cond:
440
+ cond_latent_image_ids.append(
441
+ self._prepare_image_ids(
442
+ condition_image_latents.shape[2] // 2,
443
+ condition_image_latents.shape[3] // 2,
444
+ offset_w=offset_ * condition_image_latents.shape[3] // 2
445
+ )
446
+ )
447
+
448
+ print(1)
449
+ if mask_affine is not None:
450
+ affine_H, affine_W = condition_image_latents.shape[2] // 2, condition_image_latents.shape[3] // 2
451
+ scale_factor = 1 / 16
452
+ cond_latent_image_ids_fg = cond_latent_image_ids[0].reshape(affine_H, affine_W, 3).clone()
453
+
454
+ # opt 1
455
+ cond_latent_image_ids[0] = warp_affine_tensor(
456
+ cond_latent_image_ids_fg, mask_affine, output_size=(affine_H, affine_W),
457
+ scale_factor=scale_factor, device=self.device,
458
+ )
459
+ cond_latent_image_ids = torch.stack(cond_latent_image_ids)
460
+ print(2)
461
+ # Pack condition latents
462
+ cond_image_latents = self._pack_latents(condition_image_latents)
463
+ cond_input = {
464
+ "image_latents": cond_image_latents,
465
+ "image_ids": cond_latent_image_ids,
466
+ }
467
+ # Prepare initial latents
468
+ width, height = bucket_size
469
+ num_channels_latents = self.denoise_model.config.in_channels // 4
470
+ latents, latent_image_ids = self._prepare_latents(
471
+ batch_size, num_channels_latents, height, width, generator
472
+ )
473
+ print(3)
474
+ # Setup timesteps
475
+ sigmas = np.linspace(1.0, 1 / self.config.num_inference_steps, self.config.num_inference_steps)
476
+ image_seq_len = latents.shape[1]
477
+ mu = calculate_shift(
478
+ image_seq_len,
479
+ self.scheduler.config.base_image_seq_len,
480
+ self.scheduler.config.max_image_seq_len,
481
+ self.scheduler.config.base_shift,
482
+ self.scheduler.config.max_shift,
483
+ )
484
+ timesteps, num_inference_steps = retrieve_timesteps(
485
+ self.scheduler,
486
+ self.config.num_inference_steps,
487
+ self.device,
488
+ sigmas=sigmas,
489
+ mu=mu,
490
+ )
491
+ print(4)
492
+ # Denoising loop
493
+ for i, t in enumerate(timesteps):
494
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
495
+ with torch.autocast(enabled=True, device_type="cuda", dtype=self.config.dtype):
496
+ noise_pred = self.denoise_model(
497
+ hidden_states=latents,
498
+ cond_input=cond_input,
499
+ timestep=timestep / 1000,
500
+ guidance=guidance,
501
+ pooled_projections=pooled_prompt_embeds,
502
+ encoder_hidden_states=prompt_embeds,
503
+ txt_ids=text_ids,
504
+ img_ids=latent_image_ids,
505
+ data_num_per_group=batch_size,
506
+ image_tags=self.config.image_tags,
507
+ context_tags=self.config.context_tags,
508
+ max_sequence_length=self.config.max_sequence_length,
509
+ mix_attention_double=self.config.mix_attention_double,
510
+ mix_attention_single=self.config.mix_attention_single,
511
+ joint_attention_kwargs=None,
512
+ return_dict=False,
513
+ )[0]
514
+
515
+ if truecfg and i >= 1:
516
+ guidance_neg = torch.full([1], 1, device=self.device, dtype=torch.float32)
517
+ guidance_neg = guidance_neg.expand(batch_size)
518
+ noise_pred_neg = self.denoise_model(
519
+ hidden_states=latents,
520
+ cond_input=cond_input,
521
+ timestep=timestep / 1000,
522
+ guidance=guidance,
523
+ pooled_projections=pooled_prompt_embeds,
524
+ encoder_hidden_states=prompt_embeds,
525
+ txt_ids=text_ids,
526
+ img_ids=latent_image_ids,
527
+ data_num_per_group=batch_size,
528
+ image_tags=self.config.image_tags,
529
+ context_tags=self.config.context_tags,
530
+ max_sequence_length=self.config.max_sequence_length,
531
+ mix_attention_double=self.config.mix_attention_double,
532
+ mix_attention_single=self.config.mix_attention_single,
533
+ joint_attention_kwargs=None,
534
+ return_dict=False,
535
+ )[0]
536
+ noise_pred = noise_pred_neg + 5 * (noise_pred - noise_pred_neg)
537
+
538
+ # Compute previous noisy sample
539
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
540
+ print(5)
541
+ # Decode latents
542
+ latents = self._unpack_latents(latents, height, width)
543
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
544
+ images = self.vae.decode(latents, return_dict=False)[0]
545
+ print(6)
546
+ # Post-process images
547
+ images = images.add(1).mul(127.5).clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1).cpu().numpy()
548
+ return images
549
+
550
+ def _encode_images(self, images):
551
+ return encode_images_cond(self.vae, [images], self.device)
552
+
553
+ def _prepare_image_ids(self, h, w, offset_w=0):
554
+ return _prepare_image_ids(h, w, offset_w=offset_w).to(self.device)
555
+
556
+ def _pack_latents(self, latents):
557
+ b, c, h, w = latents.shape
558
+ return _pack_latents(latents, b, c, h, w)
559
+
560
+ def _unpack_latents(self, latents, height, width):
561
+ vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
562
+ return _unpack_latents(latents, height, width, vae_scale)
563
+
564
+ def _prepare_latents(self, batch_size, num_channels_latents, height, width, generator):
565
+ vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
566
+ latents, latent_image_ids = prepare_latents(
567
+ batch_size=batch_size,
568
+ num_channels_latents=num_channels_latents,
569
+ vae_downsample_factor=vae_scale,
570
+ height=height,
571
+ width=width,
572
+ dtype=self.config.dtype,
573
+ device=self.device,
574
+ generator=generator,
575
+ offset=None
576
+ )
577
+ return latents, latent_image_ids
578
+
579
+ def main():
580
+ parser = transformers.HfArgumentParser(InferenceConfig)
581
+ config: InferenceConfig = parser.parse_args_into_dataclasses()[0]
582
+ model = DreamFuseInference(config)
583
+ os.makedirs(config.valid_output_dir, exist_ok=True)
584
+ for valid_root, valid_json in zip(config.valid_roots, config.valid_jsons):
585
+ with open(valid_json, 'r') as f:
586
+ valid_info = json.load(f)
587
+
588
+ # multi gpu
589
+ to_process = sorted(list(valid_info.keys()))
590
+
591
+ # debug
592
+ to_process = [k for k in to_process if "data_wear" in k and "pixelwave" in k]
593
+ # debug
594
+
595
+ sd_idx = len(to_process) // config.total_num * config.sub_idx
596
+ ed_idx = len(to_process) // config.total_num * (config.sub_idx + 1)
597
+ if config.sub_idx < config.total_num - 1:
598
+ print(config.sub_idx, sd_idx, ed_idx)
599
+ to_process = to_process[sd_idx:ed_idx]
600
+ else:
601
+ print(config.sub_idx, sd_idx)
602
+ to_process = to_process[sd_idx:]
603
+ valid_info = {k: valid_info[k] for k in to_process}
604
+
605
+ for meta_key, info in tqdm(valid_info.items()):
606
+ img_name = meta_key.split('/')[-1]
607
+
608
+ foreground_img = Image.open(os.path.join(valid_root, info['img_info']['000']))
609
+ background_img = Image.open(os.path.join(valid_root, info['img_info']['001']))
610
+
611
+ new_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000_mask_scale']))
612
+ ori_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000']))
613
+
614
+ # debug
615
+ foreground_img.save(os.path.join(config.valid_output_dir, f"{img_name}_0.png"))
616
+ background_img.save(os.path.join(config.valid_output_dir, f"{img_name}_1.png"))
617
+ ori_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask.png"))
618
+ new_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask_scale.png"))
619
+ # debug
620
+
621
+ foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_fg_mask))
622
+
623
+ images = model.model_generate(foreground_img.copy(), background_img.copy(),
624
+ ori_fg_mask, new_fg_mask,
625
+ prompt=config.ref_prompts,
626
+ seed=config.seed,
627
+ cfg=config.guidance_scale,
628
+ size_select=config.inference_scale,
629
+ text_strength=config.text_strength,
630
+ truecfg=config.truecfg)
631
+
632
+ result_image = Image.fromarray(images[0], "RGB")
633
+ result_image = result_image.resize(background_img.size)
634
+ result_image.save(os.path.join(config.valid_output_dir, f"{img_name}_2.png"))
635
+ # Make grid
636
+ grid_image = [foreground_img, background_img] + [result_image]
637
+ result = make_image_grid(grid_image, 1, len(grid_image), size=result_image.size)
638
+
639
+ result.save(os.path.join(config.valid_output_dir, f"{img_name}.jpg"))
640
+
641
+ if __name__ == "__main__":
642
+ main()
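A hedged usage sketch of the class above (illustrative, not part of the commit; the LoRA path and image filenames are placeholders). It mirrors what main() does for a single foreground/background pair, calling model_generate directly; passing the same mask twice keeps the foreground placement unchanged:

from PIL import Image, ImageOps
from dreamfuse_inference import DreamFuseInference, InferenceConfig

config = InferenceConfig(lora_id="/path/to/dreamfuse_lora")   # placeholder adapter path
model = DreamFuseInference(config)

fg = Image.open("fg.png").convert("RGB")                      # placeholder inputs
bg = Image.open("bg.png").convert("RGB")
fg_mask = Image.open("fg_mask.png").convert("L")              # foreground segmentation mask

fg.paste((255, 255, 255), mask=ImageOps.invert(fg_mask))      # white out everything outside the subject
images = model.model_generate(fg, bg, fg_mask, fg_mask,
                              prompt=config.ref_prompts, seed=config.seed,
                              cfg=config.guidance_scale, size_select=config.inference_scale,
                              text_strength=config.text_strength, truecfg=config.truecfg)
Image.fromarray(images[0], "RGB").resize(bg.size).save("fused.png")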
examples/9_01.png ADDED

Git LFS Details

  • SHA256: 5e41295df53b3ee903a93034a0de0118570ba5f5d7e6dd08439592cc377cf672
  • Pointer size: 131 Bytes
  • Size of remote file: 383 kB
examples/9_02.png ADDED

Git LFS Details

  • SHA256: cdfa95db79005c7015afc9d557c5eebfcb4a83f4e43b9d0439241ea00a8888d8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.87 MB
output_images/no_bg_image.png ADDED

Git LFS Details

  • SHA256: 339a905140e7f12e9443ce8acc5455b89c198a6ddda6e94f322797fedb2c04c8
  • Pointer size: 131 Bytes
  • Size of remote file: 496 kB
requirements.txt ADDED
@@ -0,0 +1,37 @@
1
+ lmdb==1.4.1
2
+ tqdm==4.66.1
3
+ datasets
4
+ tensorboardX
5
+ accelerate
6
+ ninja
7
+ transformers==4.46.3
8
+ pycocotools==2.0.7
9
+ scikit-image
10
+ Pillow==9.5.0
11
+ opencv-python
12
+ opencv-python-headless
13
+ datasets
14
+ einops==0.8.0
15
+ sentencepiece
16
+ pydantic==2.9.2
17
+ deepspeed
18
+ peft==0.14.0
19
+ diffusers==0.32.0
20
+ rotary-embedding-torch==0.8.4
21
+ tiktoken==0.8.0
22
+ transformers_stream_generator==0.0.5
23
+ ftfy
24
+ bs4
25
+ bson==0.5.10
26
+ gradio==5.12.0
27
+ httpx
28
+ fairscale==0.4.13
29
+ kornia
30
+ timm==1.0.9
31
+ protobuf==3.20.0
32
+ basicsr
33
+ sentencepiece
34
+ huggingface_hub
35
+ prodigyopt
36
+ torch==2.4.0
37
+ torchvision==0.19.0
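A quick, illustrative sanity check (not part of the commit) that the pins above resolved and a GPU is visible before running inference:

import torch, diffusers, transformers, peft, gradio
print("torch", torch.__version__, "| diffusers", diffusers.__version__,
      "| transformers", transformers.__version__, "| peft", peft.__version__,
      "| gradio", gradio.__version__)
print("cuda available:", torch.cuda.is_available())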