# import gradio as gr
# import torch
# import spaces
# from diffusers import FluxPipeline, DiffusionPipeline
# from torchao.quantization import autoquant



# # # # normal FluxPipeline
# pipeline_normal = FluxPipeline.from_pretrained(
#     "sayakpaul/FLUX.1-merged",
#     torch_dtype=torch.bfloat16
# ).to("cuda")
# pipeline_normal.transformer.to(memory_format=torch.channels_last)
# pipeline_normal.transformer = torch.compile(pipeline_normal.transformer, mode="max-autotune", fullgraph=True)


# # # optimized FluxPipeline
# # pipeline_optimized = FluxPipeline.from_pretrained(
# #     "camenduru/FLUX.1-dev-diffusers",
# #     torch_dtype=torch.bfloat16
# # ).to("cuda")
# # pipeline_optimized.transformer.to(memory_format=torch.channels_last)
# # pipeline_optimized.transformer = torch.compile(
# #     pipeline_optimized.transformer,
# #     mode="max-autotune",
# #     fullgraph=True
# # )
# # # wrap the autoquant call in a try-except block to handle unsupported layers
# # for name, layer in pipeline_optimized.transformer.named_children():
# #     try:
# #         # apply autoquant to each layer
# #         pipeline_optimized.transformer._modules[name] = autoquant(layer, error_on_unseen=False)
# #         print(f"Successfully quantized {name}")
# #     except AttributeError as e:
# #         print(f"Skipping layer {name} due to error: {e}")
# #     except Exception as e:
# #         print(f"Unexpected error while quantizing {name}: {e}")

# # pipeline_optimized.transformer = autoquant(
# #     pipeline_optimized.transformer,
# #     error_on_unseen=False
# # )
# pipeline_optimized = pipeline_normal

# @spaces.GPU(duration=120)
# def generate_images(prompt, guidance_scale, num_inference_steps):
#     # # generate image with normal pipeline
#     # image_normal = pipeline_normal(
#     #     prompt=prompt,
#     #     guidance_scale=guidance_scale,
#     #     num_inference_steps=int(num_inference_steps)
#     # ).images[0]
    
#     # generate image with optimized pipeline
#     image_optimized = pipeline_optimized(
#         prompt=prompt,
#         guidance_scale=guidance_scale,
#         num_inference_steps=int(num_inference_steps)
#     ).images[0]
    
#     return image_optimized

# # set up Gradio interface
# demo = gr.Interface(
#     fn=generate_images,
#     inputs=[
#         gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
#         gr.Slider(1.0, 10.0, step=0.5, value=3.5, label="Guidance Scale"),
#         gr.Slider(10, 100, step=1, value=50, label="Number of Inference Steps")
#     ],
#     outputs=[
#         gr.Image(type="pil", label="Optimized FluxPipeline")
#     ],
#     title="FluxPipeline Comparison",
#     description="Compare images generated by the normal FluxPipeline and the optimized one using torchao and torch.compile()."
# )

# demo.launch()
import gradio as gr
import torch
from optimum.quanto import quantize  # not called directly, but ensures optimum.quanto is importable when the quantized .pt files are unpickled below
from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKL
from transformers import CLIPTextModel, CLIPTokenizer, T5TokenizerFast
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
import subprocess
import spaces
import os

# Set the data type for inference
dtype = torch.bfloat16

# Hugging Face repository and revision settings
repo_name = "FLUX.1-schnell-4bit"
bfl_repo = "black-forest-labs/FLUX.1-schnell"
revision = "refs/pr/1"

# Ensure local directory exists and download model files
os.makedirs(repo_name, exist_ok=True)  # portable equivalent of "mkdir -p"
subprocess.run([
    "huggingface-cli", "download", "PrunaAI/" + repo_name, 
    "--local-dir", repo_name, 
    "--local-dir-use-symlinks", "False"
])
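
# A commented-out alternative sketch: the same snapshot could be fetched with the
# huggingface_hub Python API instead of shelling out to the CLI (assumes
# huggingface_hub is installed, which the CLI call above already requires):
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id="PrunaAI/" + repo_name, local_dir=repo_name)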

# Load scheduler, tokenizer, and VAE from the pre-trained repo
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(bfl_repo, subfolder="scheduler", revision=revision)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=dtype)
vae = AutoencoderKL.from_pretrained(bfl_repo, subfolder="vae", torch_dtype=dtype, revision=revision)

# Load text_encoder_2 and tokenizer_2 locally
text_encoder_2 = torch.load(repo_name + '/text_encoder_2.pt')
tokenizer_2 = T5TokenizerFast.from_pretrained(bfl_repo, subfolder="tokenizer_2", torch_dtype=dtype, revision=revision)

# Load transformer locally (quantized model)
transformer = torch.load(repo_name + '/transformer.pt')
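# Note: these torch.load calls unpickle full module objects rather than plain
# state dicts; on recent PyTorch releases, passing weights_only=False explicitly
# may be required.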

# Create the pipeline using the pre-trained models
pipe = FluxPipeline(
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    text_encoder_2=text_encoder_2,
    tokenizer_2=tokenizer_2,
    vae=vae,
    transformer=transformer,
)

# Enable model CPU offload to save memory
pipe.enable_model_cpu_offload()
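# Offloading relies on the accelerate package being installed; with enough GPU
# memory, pipe.to("cuda") could be used instead to keep all weights resident.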

# Define the image generation function
@spaces.GPU(duration=120)
def generate_image(prompt, guidance_scale, num_inference_steps):
    generator = torch.Generator().manual_seed(12345)  # fixed seed keeps outputs reproducible for identical inputs
    image = pipe(
        prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=int(num_inference_steps),
        max_sequence_length=256,
        generator=generator
    ).images[0]
    return image
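
# A minimal, commented-out smoke test with a hypothetical prompt and settings,
# left disabled so the Space only serves the Gradio UI defined below:
# if __name__ == "__main__":
#     generate_image("a tiny robot watering a bonsai tree", 0.0, 4).save("test.png")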

# Set up Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# FLUX.1-schnell 4-bit Quantized Model")

    # Input for text prompt
    prompt_input = gr.Textbox(lines=2, label="Prompt", placeholder="Enter your prompt here...")

    # Slider for guidance scale
    guidance_scale_input = gr.Slider(0.0, 10.0, step=0.1, value=7.5, label="Guidance Scale")

    # Slider for number of inference steps
    inference_steps_input = gr.Slider(4, 50, step=1, value=25, label="Number of Inference Steps")

    # Button to trigger generation
    generate_button = gr.Button("Generate Image")

    # Output image
    output_image = gr.Image(label="Generated Image", type="pil")

    # Connect button to the image generation function
    generate_button.click(fn=generate_image, 
                          inputs=[prompt_input, guidance_scale_input, inference_steps_input], 
                          outputs=[output_image])

# Launch the Gradio app
demo.launch()
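
# demo.launch() blocks and serves the UI; the defaults are fine inside a Space,
# while a local run could, for example, pass share=True for a temporary public link.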