himanshu1844 committed
Commit 1d4e95e · 1 Parent(s): c221508
Files changed (3)
  1. app.py +3 -1
  2. model.py +3 -165
  3. setup.py +7 -0
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
-from Voxify import VoxifyInfereence
 import torchaudio
+from Voxify import VoxifyInfereence
 voxify=VoxifyInfereence(name="declare-lab/TangoFlux")
 def gradio_generate(prompt, steps, guidance,duration=10):
 
@@ -14,6 +14,8 @@ def gradio_generate(prompt, steps, guidance,duration=10):
 
 
     return filename
+
+
 description_text = """
 * Powered by **Stability AI**
 Generate high quality and faithful audio in just a few seconds using <b>VOXIFY</b> by providing a text prompt. <b>VOXIFY</b> was trained from scratch and underwent alignment to follow human instructions using a new method called <b>CLAP-Ranked Preference Optimization (CRPO)</b>.
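The diff only touches the imports and spacing of app.py; the body of gradio_generate and the Gradio interface wiring are not shown. For orientation, a minimal sketch of how such an app is typically assembled might look like the following. The .generate method name, slider ranges, output filename, and sample rate are assumptions for illustration, not the project's API; the VoxifyInfereence spelling follows the committed code.

import gradio as gr
import torchaudio
from Voxify import VoxifyInfereence  # spelling as in the commit

voxify = VoxifyInfereence(name="declare-lab/TangoFlux")

def gradio_generate(prompt, steps, guidance, duration=10):
    # Hypothetical body: run inference, write the waveform to disk, return the path.
    audio = voxify.generate(prompt, steps=steps, guidance_scale=guidance, duration=duration)  # assumed method
    filename = "output.wav"
    torchaudio.save(filename, audio, 44100)  # assumed sample rate
    return filename

demo = gr.Interface(
    fn=gradio_generate,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Slider(minimum=10, maximum=100, value=25, step=1, label="Steps"),
        gr.Slider(minimum=1.0, maximum=10.0, value=4.5, label="Guidance scale"),
    ],
    outputs=gr.Audio(label="Generated audio", type="filepath"),
)
demo.launch()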
 
model.py CHANGED
@@ -1,25 +1,17 @@
-from transformers import T5EncoderModel,T5TokenizerFast
 import torch
+from transformers import T5EncoderModel,T5TokenizerFast
 from diffusers import FluxTransformer2DModel
 from torch import nn
-
 from typing import List
 from diffusers import FlowMatchEulerDiscreteScheduler
-from diffusers.training_utils import compute_density_for_timestep_sampling
 import copy
 import torch.nn.functional as F
 import numpy as np
 from tqdm import tqdm
-
-from typing import Optional,Union,List
-from datasets import load_dataset, Audio
 from math import pi
 import inspect
-import yaml
-import random
-
-
 
+from typing import Optional,Union,List
 class StableAudioPositionalEmbedding(nn.Module):
     """Used for continuous time
     Adapted from stable audio open.
@@ -38,7 +30,6 @@ class StableAudioPositionalEmbedding(nn.Module):
         fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
         fouriered = torch.cat((times, fouriered), dim=-1)
         return fouriered
-
 class DurationEmbedder(nn.Module):
     """
     A simple linear projection model to map numbers to a latent space.
@@ -350,157 +341,4 @@ class Voxify(nn.Module):
             latents = scheduler.step(noise_pred, t, latents).prev_sample
 
 
-        return latents
-
-    def forward(self,
-                latents,
-                prompt,
-                duration=torch.tensor([10]),
-                sft=True
-                ):
-
-        device = latents.device
-        audio_seq_length = self.audio_seq_len
-        bsz = latents.shape[0]
-
-        encoder_hidden_states, boolean_encoder_mask = self.encode_text(prompt)
-        duration_hidden_states = self.encode_duration(duration)
-
-        mask_expanded = boolean_encoder_mask.unsqueeze(-1).expand_as(encoder_hidden_states)
-        masked_data = torch.where(mask_expanded, encoder_hidden_states, torch.tensor(float('nan')))
-        pooled = torch.nanmean(masked_data, dim=1)
-        pooled_projection = self.fc(pooled)
-
-        ## Add duration hidden states to encoder hidden states
-        encoder_hidden_states = torch.cat([encoder_hidden_states,duration_hidden_states],dim=1) ## (bs,seq_len,dim)
-
-        txt_ids = torch.zeros(bsz,encoder_hidden_states.shape[1],3).to(device)
-        audio_ids = torch.arange(audio_seq_length).unsqueeze(0).unsqueeze(-1).repeat(bsz,1,3).to(device)
-
-        if sft:
-
-            if self.uncondition:
-                mask_indices = [k for k in range(len(prompt)) if random.random() < 0.1]
-                if len(mask_indices) > 0:
-                    encoder_hidden_states[mask_indices] = 0
-
-            noise = torch.randn_like(latents)
-
-            u = compute_density_for_timestep_sampling(
-                weighting_scheme='logit_normal',
-                batch_size=bsz,
-                logit_mean=0,
-                logit_std=1,
-                mode_scale=None,
-            )
-
-            indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
-            timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=latents.device)
-            sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)
-
-            noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise
-
-            model_pred = self.transformer(
-                hidden_states=noisy_model_input,
-                encoder_hidden_states=encoder_hidden_states,
-                pooled_projections=pooled_projection,
-                img_ids=audio_ids,
-                txt_ids=txt_ids,
-                guidance=None,
-                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
-                timestep=timesteps/1000,
-                return_dict=False)[0]
-
-            target = noise - latents
-            loss = torch.mean(
-                ((model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
-                1,
-            )
-            loss = loss.mean()
-            raw_model_loss, raw_ref_loss,implicit_acc,epsilon_diff = 0,0,0,0 ## default this to 0 if doing sft
-
-        else:
-            encoder_hidden_states = encoder_hidden_states.repeat(2, 1, 1)
-            pooled_projection = pooled_projection.repeat(2,1)
-            noise = torch.randn_like(latents).chunk(2)[0].repeat(2, 1, 1) ## Have to sample same noise for preferred and rejected
-            u = compute_density_for_timestep_sampling(
-                weighting_scheme='logit_normal',
-                batch_size=bsz//2,
-                logit_mean=0,
-                logit_std=1,
-                mode_scale=None,
-            )
-
-            indices = (u * self.noise_scheduler_copy.config.num_train_timesteps).long()
-            timesteps = self.noise_scheduler_copy.timesteps[indices].to(device=latents.device)
-            timesteps = timesteps.repeat(2)
-            sigmas = self.get_sigmas(timesteps, n_dim=latents.ndim, dtype=latents.dtype)
-
-            noisy_model_input = (1.0 - sigmas) * latents + sigmas * noise
-
-            model_pred = self.transformer(
-                hidden_states=noisy_model_input,
-                encoder_hidden_states=encoder_hidden_states,
-                pooled_projections=pooled_projection,
-                img_ids=audio_ids,
-                txt_ids=txt_ids,
-                guidance=None,
-                # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transforme rmodel (we should not keep it but I want to keep the inputs same for the model for testing)
-                timestep=timesteps/1000,
-                return_dict=False)[0]
-            target = noise - latents
-
-            model_losses = F.mse_loss(model_pred.float(), target.float(), reduction="none")
-            model_losses = model_losses.mean(dim=list(range(1, len(model_losses.shape))))
-            model_losses_w, model_losses_l = model_losses.chunk(2)
-            model_diff = model_losses_w - model_losses_l
-            raw_model_loss = 0.5 * (model_losses_w.mean() + model_losses_l.mean())
-
-            with torch.no_grad():
-                ref_preds = self.ref_transformer(
-                    hidden_states=noisy_model_input,
-                    encoder_hidden_states=encoder_hidden_states,
-                    pooled_projections=pooled_projection,
-                    img_ids=audio_ids,
-                    txt_ids=txt_ids,
-                    guidance=None,
-                    timestep=timesteps/1000,
-                    return_dict=False)[0]
-
-                ref_loss = F.mse_loss(ref_preds.float(), target.float(), reduction="none")
-                ref_loss = ref_loss.mean(dim=list(range(1, len(ref_loss.shape))))
-
-                ref_losses_w, ref_losses_l = ref_loss.chunk(2)
-                ref_diff = ref_losses_w - ref_losses_l
-                raw_ref_loss = ref_loss.mean()
-
-            epsilon_diff = torch.max(torch.zeros_like(model_losses_w),
-                                     ref_losses_w-model_losses_w).mean()
-
-            scale_term = -0.5 * self.beta_dpo
-            inside_term = scale_term * (model_diff - ref_diff)
-            implicit_acc = (scale_term * (model_diff - ref_diff) > 0).sum().float() / inside_term.size(0)
-            loss = -1 * F.logsigmoid(inside_term).mean() + model_losses_w.mean()
-
-        return loss, raw_model_loss, raw_ref_loss, implicit_acc,epsilon_diff
+        return latents
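The bulk of this change deletes the training-time forward() (the SFT branch and the preference-optimization branch) from model.py. For readers following the CRPO logic, the core of the removed preference loss can be restated as a small standalone sketch. Variable names mirror the deleted code; this is an illustration of that computation, not part of the project's remaining API.

import torch
import torch.nn.functional as F

def crpo_preference_loss(model_losses_w, model_losses_l,
                         ref_losses_w, ref_losses_l, beta_dpo):
    """DPO-style loss over flow-matching errors, as in the removed forward():
    push the policy to lower its error on the preferred sample relative to a
    frozen reference transformer, plus a plain MSE term on the preferred sample."""
    model_diff = model_losses_w - model_losses_l   # policy margin (preferred vs. rejected)
    ref_diff = ref_losses_w - ref_losses_l         # reference margin
    inside_term = -0.5 * beta_dpo * (model_diff - ref_diff)
    implicit_acc = (inside_term > 0).float().mean()   # how often the policy beats the reference
    loss = -F.logsigmoid(inside_term).mean() + model_losses_w.mean()
    return loss, implicit_acc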
 
 
setup.py ADDED
@@ -0,0 +1,7 @@
+import os
+requirement_path = "requirements.txt"
+install_requires = []
+if os.path.isfile(requirement_path):
+    with open(requirement_path) as f:
+        install_requires = f.read().splitlines()
+setup(name="mypackage", install_requires=install_requires, [...])
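As committed, setup.py never imports setup and ends with a literal [...] placeholder, so it will not run as-is. A minimal runnable sketch under the usual setuptools conventions is shown below; the package name, version, and find_packages() call are assumptions kept from or added around the committed snippet, not project metadata.

import os
from setuptools import setup, find_packages  # the committed file omits this import

requirement_path = "requirements.txt"
install_requires = []
if os.path.isfile(requirement_path):
    with open(requirement_path) as f:
        install_requires = f.read().splitlines()

setup(
    name="mypackage",              # placeholder name, as in the commit
    version="0.1.0",               # assumed version
    packages=find_packages(),
    install_requires=install_requires,
)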