"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
dt - dimension text
df - dimension frames
nc - context sequence
dc - context dimension
"""
from __future__ import annotations
from pathlib import Path
from random import random
from functools import partial
from itertools import zip_longest
from collections import namedtuple
from typing import Literal, Callable
import jaxtyping
from beartype import beartype
import torch
import torch.nn.functional as F
from torch import nn, tensor, Tensor, from_numpy
from torch.nn import Module, ModuleList, Sequential, Linear
from torch.nn.utils.rnn import pad_sequence
import torchaudio
from torchaudio.functional import DB_to_amplitude
from torchdiffeq import odeint
import einx
from einops.layers.torch import Rearrange
from einops import rearrange, repeat, reduce, pack, unpack
from x_transformers import (
Attention,
FeedForward,
RMSNorm,
AdaptiveRMSNorm,
)
from x_transformers.x_transformers import RotaryEmbedding
import sys
sys.path.insert(0, "/zhanghaomin/codes3/vocos-main/")
from vocos import Vocos
from transformers import AutoTokenizer
from transformers import T5EncoderModel
from transformers import EncodecModel, AutoProcessor
sys.path.insert(0, "./src/audeo/")
import Video2RollNet
import torchvision.transforms as transforms
####transform = transforms.Compose([lambda x: x.resize((900,100)),
#### lambda x: np.reshape(x,(100,900,1)),
#### lambda x: np.transpose(x,[2,0,1]),
#### lambda x: x/255.])
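# Preprocess a PIL keyboard-crop frame for Video2RollNet: resize to 100x900 (W x H),
# then convert to a (1, 100, 900) float array scaled to [0, 1]. numpy (np) is imported
# further below; the lambdas only run at call time, so the late import is safe.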
transform = transforms.Compose([lambda x: x.resize((100,900)),
lambda x: np.reshape(x,(900,100,1)),
lambda x: np.transpose(x,[2,1,0]),
lambda x: x/255.])
####NOTES = 51
####NOTTE_MIN = 15
####NOTE_MAX = 65
NOTES = 88
NOTTE_MIN = 0  # 15
NOTE_MAX = 87  # 72
import os
import math
import traceback
import numpy as np
from moviepy.editor import AudioFileClip, VideoFileClip
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
#import open_clip
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import time
import warnings
warnings.filterwarnings("ignore")
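# Zero-center a waveform and scale it so the peak magnitude of its first channel is 0.5.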
def normalize_wav(waveform):
waveform = waveform - torch.mean(waveform)
waveform = waveform / (torch.max(torch.abs(waveform[0, :])) + 1e-8)
return waveform * 0.5
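# Decode all frames of a video with moviepy, optionally subsampling to max_frame_nums
# evenly spaced frames; returns (frames, duration), or (None, None) on failure.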
def read_frames_with_moviepy(video_path, max_frame_nums=None):
try:
clip = VideoFileClip(video_path)
duration = clip.duration
frames = []
for frame in clip.iter_frames():
frames.append(frame)
    except Exception:
print("Error read_frames_with_moviepy", video_path)
traceback.print_exc()
return None, None
if max_frame_nums is not None:
frames_idx = np.linspace(0, len(frames) - 1, max_frame_nums, dtype=int)
return np.array(frames)[frames_idx, ...], duration
else:
return np.array(frames), duration
pad_sequence = partial(pad_sequence, batch_first = True)
# constants
class TorchTyping:
def __init__(self, abstract_dtype):
self.abstract_dtype = abstract_dtype
def __getitem__(self, shapes: str):
return self.abstract_dtype[Tensor, shapes]
Float = TorchTyping(jaxtyping.Float)
Int = TorchTyping(jaxtyping.Int)
Bool = TorchTyping(jaxtyping.Bool)
# named tuples
LossBreakdown = namedtuple('LossBreakdown', ['flow', 'velocity_consistency', 'a', 'b'])
E2TTSReturn = namedtuple('E2TTS', ['loss', 'cond', 'pred_flow', 'pred_data', 'loss_breakdown'])
# helpers
def exists(v):
return v is not None
def default(v, d):
return v if exists(v) else d
def divisible_by(num, den):
return (num % den) == 0
def pack_one_with_inverse(x, pattern):
packed, packed_shape = pack([x], pattern)
def inverse(x, inverse_pattern = None):
inverse_pattern = default(inverse_pattern, pattern)
return unpack(x, packed_shape, inverse_pattern)[0]
return packed, inverse
class Identity(Module):
def forward(self, x, **kwargs):
return x
# tensor helpers
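# project: decompose x into its components parallel and orthogonal to y (flattened per
# batch); computed in float64 for numerical stability.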
def project(x, y):
x, inverse = pack_one_with_inverse(x, 'b *')
y, _ = pack_one_with_inverse(y, 'b *')
dtype = x.dtype
x, y = x.double(), y.double()
unit = F.normalize(y, dim = -1)
parallel = (x * unit).sum(dim = -1, keepdim = True) * unit
orthogonal = x - parallel
return inverse(parallel).to(dtype), inverse(orthogonal).to(dtype)
# simple utf-8 tokenizer, since paper went character based
def list_str_to_tensor(
text: list[str],
padding_value = -1
) -> Int['b nt']:
list_tensors = [tensor([*bytes(t, 'UTF-8')]) for t in text]
    padded_tensor = pad_sequence(list_tensors, padding_value = padding_value)
return padded_tensor
# simple english phoneme-based tokenizer
from g2p_en import G2p
import jieba
from pypinyin import lazy_pinyin, Style
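# English grapheme-to-phoneme tokenizer built on g2p_en, with a few punctuation/space
# symbols appended to the phoneme vocabulary. Returns (encode, vocab_size).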
def get_g2p_en_encode():
g2p = G2p()
# used by @lucasnewman successfully here
# https://github.com/lucasnewman/e2-tts-pytorch/blob/ljspeech-test/e2_tts_pytorch/e2_tts.py
phoneme_to_index = g2p.p2idx
num_phonemes = len(phoneme_to_index)
extended_chars = [' ', ',', '.', '-', '!', '?', '\'', '"', '...', '..', '. .', '. . .', '. . . .', '. . . . .', '. ...', '... .', '.. ..']
num_extended_chars = len(extended_chars)
extended_chars_dict = {p: (num_phonemes + i) for i, p in enumerate(extended_chars)}
phoneme_to_index = {**phoneme_to_index, **extended_chars_dict}
def encode(
text: list[str],
padding_value = -1
) -> Int['b nt']:
phonemes = [g2p(t) for t in text]
list_tensors = [tensor([phoneme_to_index[p] for p in one_phoneme]) for one_phoneme in phonemes]
        padded_tensor = pad_sequence(list_tensors, padding_value = padding_value)
return padded_tensor
return encode, (num_phonemes + num_extended_chars)
def all_en(word):
res = word.replace("'", "").encode('utf-8').isalpha()
return res
def all_ch(word):
res = True
for w in word:
if not '\u4e00' <= w <= '\u9fff':
res = False
return res
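# Chinese (and mixed Chinese/English) tokenizer: jieba word segmentation, pypinyin TONE3
# pinyin with tone sandhi for Chinese words, per-character letters for English, plus a
# small punctuation set; texts containing unsupported characters are dropped with a
# warning. Returns (encode, vocab_size).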
def get_g2p_zh_encode():
puncs = [',', '。', '?', '、']
pinyins = ['a', 'a1', 'ai1', 'ai2', 'ai3', 'ai4', 'an1', 'an3', 'an4', 'ang1', 'ang2', 'ang4', 'ao1', 'ao2', 'ao3', 'ao4', 'ba', 'ba1', 'ba2', 'ba3', 'ba4', 'bai1', 'bai2', 'bai3', 'bai4', 'ban1', 'ban2', 'ban3', 'ban4', 'bang1', 'bang2', 'bang3', 'bang4', 'bao1', 'bao2', 'bao3', 'bao4', 'bei', 'bei1', 'bei2', 'bei3', 'bei4', 'ben1', 'ben2', 'ben3', 'ben4', 'beng1', 'beng2', 'beng4', 'bi1', 'bi2', 'bi3', 'bi4', 'bian1', 'bian2', 'bian3', 'bian4', 'biao1', 'biao2', 'biao3', 'bie1', 'bie2', 'bie3', 'bie4', 'bin1', 'bin4', 'bing1', 'bing2', 'bing3', 'bing4', 'bo', 'bo1', 'bo2', 'bo3', 'bo4', 'bu2', 'bu3', 'bu4', 'ca1', 'cai1', 'cai2', 'cai3', 'cai4', 'can1', 'can2', 'can3', 'can4', 'cang1', 'cang2', 'cao1', 'cao2', 'cao3', 'ce4', 'cen1', 'cen2', 'ceng1', 'ceng2', 'ceng4', 'cha1', 'cha2', 'cha3', 'cha4', 'chai1', 'chai2', 'chan1', 'chan2', 'chan3', 'chan4', 'chang1', 'chang2', 'chang3', 'chang4', 'chao1', 'chao2', 'chao3', 'che1', 'che2', 'che3', 'che4', 'chen1', 'chen2', 'chen3', 'chen4', 'cheng1', 'cheng2', 'cheng3', 'cheng4', 'chi1', 'chi2', 'chi3', 'chi4', 'chong1', 'chong2', 'chong3', 'chong4', 'chou1', 'chou2', 'chou3', 'chou4', 'chu1', 'chu2', 'chu3', 'chu4', 'chua1', 'chuai1', 'chuai2', 'chuai3', 'chuai4', 'chuan1', 'chuan2', 'chuan3', 'chuan4', 'chuang1', 'chuang2', 'chuang3', 'chuang4', 'chui1', 'chui2', 'chun1', 'chun2', 'chun3', 'chuo1', 'chuo4', 'ci1', 'ci2', 'ci3', 'ci4', 'cong1', 'cong2', 'cou4', 'cu1', 'cu4', 'cuan1', 'cuan2', 'cuan4', 'cui1', 'cui3', 'cui4', 'cun1', 'cun2', 'cun4', 'cuo1', 'cuo2', 'cuo4', 'da', 'da1', 'da2', 'da3', 'da4', 'dai1', 'dai3', 'dai4', 'dan1', 'dan2', 'dan3', 'dan4', 'dang1', 'dang2', 'dang3', 'dang4', 'dao1', 'dao2', 'dao3', 'dao4', 'de', 'de1', 'de2', 'dei3', 'den4', 'deng1', 'deng2', 'deng3', 'deng4', 'di1', 'di2', 'di3', 'di4', 'dia3', 'dian1', 'dian2', 'dian3', 'dian4', 'diao1', 'diao3', 'diao4', 'die1', 'die2', 'ding1', 'ding2', 'ding3', 'ding4', 'diu1', 'dong1', 'dong3', 'dong4', 'dou1', 'dou2', 'dou3', 'dou4', 'du1', 'du2', 'du3', 'du4', 'duan1', 'duan2', 'duan3', 'duan4', 'dui1', 'dui4', 'dun1', 'dun3', 'dun4', 'duo1', 'duo2', 'duo3', 'duo4', 'e1', 'e2', 'e3', 'e4', 'ei2', 'en1', 'en4', 'er', 'er2', 'er3', 'er4', 'fa1', 'fa2', 'fa3', 'fa4', 'fan1', 'fan2', 'fan3', 'fan4', 'fang1', 'fang2', 'fang3', 'fang4', 'fei1', 'fei2', 'fei3', 'fei4', 'fen1', 'fen2', 'fen3', 'fen4', 'feng1', 'feng2', 'feng3', 'feng4', 'fo2', 'fou2', 'fou3', 'fu1', 'fu2', 'fu3', 'fu4', 'ga1', 'ga2', 'ga4', 'gai1', 'gai3', 'gai4', 'gan1', 'gan2', 'gan3', 'gan4', 'gang1', 'gang2', 'gang3', 'gang4', 'gao1', 'gao2', 'gao3', 'gao4', 'ge1', 'ge2', 'ge3', 'ge4', 'gei2', 'gei3', 'gen1', 'gen2', 'gen3', 'gen4', 'geng1', 'geng3', 'geng4', 'gong1', 'gong3', 'gong4', 'gou1', 'gou2', 'gou3', 'gou4', 'gu', 'gu1', 'gu2', 'gu3', 'gu4', 'gua1', 'gua2', 'gua3', 'gua4', 'guai1', 'guai2', 'guai3', 'guai4', 'guan1', 'guan2', 'guan3', 'guan4', 'guang1', 'guang2', 'guang3', 'guang4', 'gui1', 'gui2', 'gui3', 'gui4', 'gun3', 'gun4', 'guo1', 'guo2', 'guo3', 'guo4', 'ha1', 'ha2', 'ha3', 'hai1', 'hai2', 'hai3', 'hai4', 'han1', 'han2', 'han3', 'han4', 'hang1', 'hang2', 'hang4', 'hao1', 'hao2', 'hao3', 'hao4', 'he1', 'he2', 'he4', 'hei1', 'hen2', 'hen3', 'hen4', 'heng1', 'heng2', 'heng4', 'hong1', 'hong2', 'hong3', 'hong4', 'hou1', 'hou2', 'hou3', 'hou4', 'hu1', 'hu2', 'hu3', 'hu4', 'hua1', 'hua2', 'hua4', 'huai2', 'huai4', 'huan1', 'huan2', 'huan3', 'huan4', 'huang1', 'huang2', 'huang3', 'huang4', 'hui1', 'hui2', 'hui3', 'hui4', 'hun1', 'hun2', 'hun4', 'huo', 'huo1', 'huo2', 'huo3', 'huo4', 'ji1', 
'ji2', 'ji3', 'ji4', 'jia', 'jia1', 'jia2', 'jia3', 'jia4', 'jian1', 'jian2', 'jian3', 'jian4', 'jiang1', 'jiang2', 'jiang3', 'jiang4', 'jiao1', 'jiao2', 'jiao3', 'jiao4', 'jie1', 'jie2', 'jie3', 'jie4', 'jin1', 'jin2', 'jin3', 'jin4', 'jing1', 'jing2', 'jing3', 'jing4', 'jiong3', 'jiu1', 'jiu2', 'jiu3', 'jiu4', 'ju1', 'ju2', 'ju3', 'ju4', 'juan1', 'juan2', 'juan3', 'juan4', 'jue1', 'jue2', 'jue4', 'jun1', 'jun4', 'ka1', 'ka2', 'ka3', 'kai1', 'kai2', 'kai3', 'kai4', 'kan1', 'kan2', 'kan3', 'kan4', 'kang1', 'kang2', 'kang4', 'kao2', 'kao3', 'kao4', 'ke1', 'ke2', 'ke3', 'ke4', 'ken3', 'keng1', 'kong1', 'kong3', 'kong4', 'kou1', 'kou2', 'kou3', 'kou4', 'ku1', 'ku2', 'ku3', 'ku4', 'kua1', 'kua3', 'kua4', 'kuai3', 'kuai4', 'kuan1', 'kuan2', 'kuan3', 'kuang1', 'kuang2', 'kuang4', 'kui1', 'kui2', 'kui3', 'kui4', 'kun1', 'kun3', 'kun4', 'kuo4', 'la', 'la1', 'la2', 'la3', 'la4', 'lai2', 'lai4', 'lan2', 'lan3', 'lan4', 'lang1', 'lang2', 'lang3', 'lang4', 'lao1', 'lao2', 'lao3', 'lao4', 'le', 'le1', 'le4', 'lei', 'lei1', 'lei2', 'lei3', 'lei4', 'leng1', 'leng2', 'leng3', 'leng4', 'li', 'li1', 'li2', 'li3', 'li4', 'lia3', 'lian2', 'lian3', 'lian4', 'liang2', 'liang3', 'liang4', 'liao1', 'liao2', 'liao3', 'liao4', 'lie1', 'lie2', 'lie3', 'lie4', 'lin1', 'lin2', 'lin3', 'lin4', 'ling2', 'ling3', 'ling4', 'liu1', 'liu2', 'liu3', 'liu4', 'long1', 'long2', 'long3', 'long4', 'lou1', 'lou2', 'lou3', 'lou4', 'lu1', 'lu2', 'lu3', 'lu4', 'luan2', 'luan3', 'luan4', 'lun1', 'lun2', 'lun4', 'luo1', 'luo2', 'luo3', 'luo4', 'lv2', 'lv3', 'lv4', 'lve3', 'lve4', 'ma', 'ma1', 'ma2', 'ma3', 'ma4', 'mai2', 'mai3', 'mai4', 'man2', 'man3', 'man4', 'mang2', 'mang3', 'mao1', 'mao2', 'mao3', 'mao4', 'me', 'mei2', 'mei3', 'mei4', 'men', 'men1', 'men2', 'men4', 'meng1', 'meng2', 'meng3', 'meng4', 'mi1', 'mi2', 'mi3', 'mi4', 'mian2', 'mian3', 'mian4', 'miao1', 'miao2', 'miao3', 'miao4', 'mie1', 'mie4', 'min2', 'min3', 'ming2', 'ming3', 'ming4', 'miu4', 'mo1', 'mo2', 'mo3', 'mo4', 'mou1', 'mou2', 'mou3', 'mu2', 'mu3', 'mu4', 'n2', 'na1', 'na2', 'na3', 'na4', 'nai2', 'nai3', 'nai4', 'nan1', 'nan2', 'nan3', 'nan4', 'nang1', 'nang2', 'nao1', 'nao2', 'nao3', 'nao4', 'ne', 'ne2', 'ne4', 'nei3', 'nei4', 'nen4', 'neng2', 'ni1', 'ni2', 'ni3', 'ni4', 'nian1', 'nian2', 'nian3', 'nian4', 'niang2', 'niang4', 'niao2', 'niao3', 'niao4', 'nie1', 'nie4', 'nin2', 'ning2', 'ning3', 'ning4', 'niu1', 'niu2', 'niu3', 'niu4', 'nong2', 'nong4', 'nou4', 'nu2', 'nu3', 'nu4', 'nuan3', 'nuo2', 'nuo4', 'nv2', 'nv3', 'nve4', 'o1', 'o2', 'ou1', 'ou3', 'ou4', 'pa1', 'pa2', 'pa4', 'pai1', 'pai2', 'pai3', 'pai4', 'pan1', 'pan2', 'pan4', 'pang1', 'pang2', 'pang4', 'pao1', 'pao2', 'pao3', 'pao4', 'pei1', 'pei2', 'pei4', 'pen1', 'pen2', 'pen4', 'peng1', 'peng2', 'peng3', 'peng4', 'pi1', 'pi2', 'pi3', 'pi4', 'pian1', 'pian2', 'pian4', 'piao1', 'piao2', 'piao3', 'piao4', 'pie1', 'pie2', 'pie3', 'pin1', 'pin2', 'pin3', 'pin4', 'ping1', 'ping2', 'po1', 'po2', 'po3', 'po4', 'pou1', 'pu1', 'pu2', 'pu3', 'pu4', 'qi1', 'qi2', 'qi3', 'qi4', 'qia1', 'qia3', 'qia4', 'qian1', 'qian2', 'qian3', 'qian4', 'qiang1', 'qiang2', 'qiang3', 'qiang4', 'qiao1', 'qiao2', 'qiao3', 'qiao4', 'qie1', 'qie2', 'qie3', 'qie4', 'qin1', 'qin2', 'qin3', 'qin4', 'qing1', 'qing2', 'qing3', 'qing4', 'qiong1', 'qiong2', 'qiu1', 'qiu2', 'qiu3', 'qu1', 'qu2', 'qu3', 'qu4', 'quan1', 'quan2', 'quan3', 'quan4', 'que1', 'que2', 'que4', 'qun2', 'ran2', 'ran3', 'rang1', 'rang2', 'rang3', 'rang4', 'rao2', 'rao3', 'rao4', 're2', 're3', 're4', 'ren2', 'ren3', 'ren4', 'reng1', 'reng2', 'ri4', 'rong1', 'rong2', 
'rong3', 'rou2', 'rou4', 'ru2', 'ru3', 'ru4', 'ruan2', 'ruan3', 'rui3', 'rui4', 'run4', 'ruo4', 'sa1', 'sa2', 'sa3', 'sa4', 'sai1', 'sai4', 'san1', 'san2', 'san3', 'san4', 'sang1', 'sang3', 'sang4', 'sao1', 'sao2', 'sao3', 'sao4', 'se4', 'sen1', 'seng1', 'sha1', 'sha2', 'sha3', 'sha4', 'shai1', 'shai2', 'shai3', 'shai4', 'shan1', 'shan3', 'shan4', 'shang', 'shang1', 'shang3', 'shang4', 'shao1', 'shao2', 'shao3', 'shao4', 'she1', 'she2', 'she3', 'she4', 'shei2', 'shen1', 'shen2', 'shen3', 'shen4', 'sheng1', 'sheng2', 'sheng3', 'sheng4', 'shi', 'shi1', 'shi2', 'shi3', 'shi4', 'shou1', 'shou2', 'shou3', 'shou4', 'shu1', 'shu2', 'shu3', 'shu4', 'shua1', 'shua2', 'shua3', 'shua4', 'shuai1', 'shuai3', 'shuai4', 'shuan1', 'shuan4', 'shuang1', 'shuang3', 'shui2', 'shui3', 'shui4', 'shun3', 'shun4', 'shuo1', 'shuo4', 'si1', 'si2', 'si3', 'si4', 'song1', 'song3', 'song4', 'sou1', 'sou3', 'sou4', 'su1', 'su2', 'su4', 'suan1', 'suan4', 'sui1', 'sui2', 'sui3', 'sui4', 'sun1', 'sun3', 'suo', 'suo1', 'suo2', 'suo3', 'ta1', 'ta3', 'ta4', 'tai1', 'tai2', 'tai4', 'tan1', 'tan2', 'tan3', 'tan4', 'tang1', 'tang2', 'tang3', 'tang4', 'tao1', 'tao2', 'tao3', 'tao4', 'te4', 'teng2', 'ti1', 'ti2', 'ti3', 'ti4', 'tian1', 'tian2', 'tian3', 'tiao1', 'tiao2', 'tiao3', 'tiao4', 'tie1', 'tie2', 'tie3', 'tie4', 'ting1', 'ting2', 'ting3', 'tong1', 'tong2', 'tong3', 'tong4', 'tou', 'tou1', 'tou2', 'tou4', 'tu1', 'tu2', 'tu3', 'tu4', 'tuan1', 'tuan2', 'tui1', 'tui2', 'tui3', 'tui4', 'tun1', 'tun2', 'tun4', 'tuo1', 'tuo2', 'tuo3', 'tuo4', 'wa', 'wa1', 'wa2', 'wa3', 'wa4', 'wai1', 'wai3', 'wai4', 'wan1', 'wan2', 'wan3', 'wan4', 'wang1', 'wang2', 'wang3', 'wang4', 'wei1', 'wei2', 'wei3', 'wei4', 'wen1', 'wen2', 'wen3', 'wen4', 'weng1', 'weng4', 'wo1', 'wo3', 'wo4', 'wu1', 'wu2', 'wu3', 'wu4', 'xi1', 'xi2', 'xi3', 'xi4', 'xia1', 'xia2', 'xia4', 'xian1', 'xian2', 'xian3', 'xian4', 'xiang1', 'xiang2', 'xiang3', 'xiang4', 'xiao1', 'xiao2', 'xiao3', 'xiao4', 'xie1', 'xie2', 'xie3', 'xie4', 'xin1', 'xin2', 'xin4', 'xing1', 'xing2', 'xing3', 'xing4', 'xiong1', 'xiong2', 'xiu1', 'xiu3', 'xiu4', 'xu', 'xu1', 'xu2', 'xu3', 'xu4', 'xuan1', 'xuan2', 'xuan3', 'xuan4', 'xue1', 'xue2', 'xue3', 'xue4', 'xun1', 'xun2', 'xun4', 'ya', 'ya1', 'ya2', 'ya3', 'ya4', 'yan1', 'yan2', 'yan3', 'yan4', 'yang1', 'yang2', 'yang3', 'yang4', 'yao1', 'yao2', 'yao3', 'yao4', 'ye1', 'ye2', 'ye3', 'ye4', 'yi1', 'yi2', 'yi3', 'yi4', 'yin1', 'yin2', 'yin3', 'yin4', 'ying1', 'ying2', 'ying3', 'ying4', 'yo1', 'yong1', 'yong3', 'yong4', 'you1', 'you2', 'you3', 'you4', 'yu1', 'yu2', 'yu3', 'yu4', 'yuan1', 'yuan2', 'yuan3', 'yuan4', 'yue1', 'yue4', 'yun1', 'yun2', 'yun3', 'yun4', 'za1', 'za2', 'za3', 'zai1', 'zai3', 'zai4', 'zan1', 'zan2', 'zan3', 'zan4', 'zang1', 'zang4', 'zao1', 'zao2', 'zao3', 'zao4', 'ze2', 'ze4', 'zei2', 'zen3', 'zeng1', 'zeng4', 'zha1', 'zha2', 'zha3', 'zha4', 'zhai1', 'zhai2', 'zhai3', 'zhai4', 'zhan1', 'zhan2', 'zhan3', 'zhan4', 'zhang1', 'zhang2', 'zhang3', 'zhang4', 'zhao1', 'zhao2', 'zhao3', 'zhao4', 'zhe', 'zhe1', 'zhe2', 'zhe3', 'zhe4', 'zhen1', 'zhen2', 'zhen3', 'zhen4', 'zheng1', 'zheng2', 'zheng3', 'zheng4', 'zhi1', 'zhi2', 'zhi3', 'zhi4', 'zhong1', 'zhong2', 'zhong3', 'zhong4', 'zhou1', 'zhou2', 'zhou3', 'zhou4', 'zhu1', 'zhu2', 'zhu3', 'zhu4', 'zhua1', 'zhua2', 'zhua3', 'zhuai1', 'zhuai3', 'zhuai4', 'zhuan1', 'zhuan2', 'zhuan3', 'zhuan4', 'zhuang1', 'zhuang4', 'zhui1', 'zhui4', 'zhun1', 'zhun2', 'zhun3', 'zhuo1', 'zhuo2', 'zi', 'zi1', 'zi2', 'zi3', 'zi4', 'zong1', 'zong2', 'zong3', 'zong4', 'zou1', 'zou2', 'zou3', 'zou4', 'zu1', 
'zu2', 'zu3', 'zuan1', 'zuan3', 'zuan4', 'zui2', 'zui3', 'zui4', 'zun1', 'zuo1', 'zuo2', 'zuo3', 'zuo4']
ens = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", ' ']
ens_U = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
phoneme_to_index = {}
num_phonemes = 0
for index, punc in enumerate(puncs):
phoneme_to_index[punc] = index + num_phonemes
num_phonemes += len(puncs)
for index, pinyin in enumerate(pinyins):
phoneme_to_index[pinyin] = index + num_phonemes
num_phonemes += len(pinyins)
for index, en in enumerate(ens):
phoneme_to_index[en] = index + num_phonemes
for index, en in enumerate(ens_U):
phoneme_to_index[en] = index + num_phonemes
num_phonemes += len(ens)
#print(num_phonemes, phoneme_to_index) # 1342
def encode(
text: list[str],
padding_value = -1
) -> Int['b nt']:
phonemes = []
for t in text:
one_phoneme = []
brk = False
for word in jieba.cut(t):
if all_ch(word):
seg = lazy_pinyin(word, style=Style.TONE3, tone_sandhi=True)
one_phoneme.extend(seg)
elif all_en(word):
for seg in word:
one_phoneme.append(seg)
elif word in [",", "。", "?", "、", "'", " "]:
one_phoneme.append(word)
else:
for ch in word:
if all_ch(ch):
seg = lazy_pinyin(ch, style=Style.TONE3, tone_sandhi=True)
one_phoneme.extend(seg)
elif all_en(ch):
for seg in ch:
one_phoneme.append(seg)
else:
brk = True
break
if brk:
break
if not brk:
phonemes.append(one_phoneme)
else:
print("Error Tokenized", t, list(jieba.cut(t)))
list_tensors = [tensor([phoneme_to_index[p] for p in one_phoneme]) for one_phoneme in phonemes]
        padded_tensor = pad_sequence(list_tensors, padding_value = padding_value)
return padded_tensor
return encode, num_phonemes
# tensor helpers
def log(t, eps = 1e-5):
return t.clamp(min = eps).log()
def lens_to_mask(
t: Int['b'],
length: int | None = None
) -> Bool['b n']:
if not exists(length):
length = t.amax()
seq = torch.arange(length, device = t.device)
return einx.less('n, b -> b n', seq, t)
def mask_from_start_end_indices(
seq_len: Int['b'],
start: Int['b'],
end: Int['b']
):
max_seq_len = seq_len.max().item()
seq = torch.arange(max_seq_len, device = start.device).long()
return einx.greater_equal('n, b -> b n', seq, start) & einx.less('n, b -> b n', seq, end)
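# Sample a contiguous boolean mask covering frac_lengths of each sequence; the start is
# random during training and deterministic (rand = 0.5) when val=True, and the result is
# optionally padded to max_length.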
def mask_from_frac_lengths(
seq_len: Int['b'],
frac_lengths: Float['b'],
max_length: int | None = None,
val = False
):
lengths = (frac_lengths * seq_len).long()
max_start = seq_len - lengths
if not val:
rand = torch.rand_like(frac_lengths)
else:
rand = torch.tensor([0.5]*frac_lengths.shape[0], device=frac_lengths.device).float()
start = (max_start * rand).long().clamp(min = 0)
end = start + lengths
out = mask_from_start_end_indices(seq_len, start, end)
if exists(max_length):
out = pad_to_length(out, max_length)
return out
def maybe_masked_mean(
t: Float['b n d'],
mask: Bool['b n'] | None = None
) -> Float['b d']:
if not exists(mask):
return t.mean(dim = 1)
t = einx.where('b n, b n d, -> b n d', mask, t, 0.)
num = reduce(t, 'b n d -> b d', 'sum')
den = reduce(mask.float(), 'b n -> b', 'sum')
return einx.divide('b d, b -> b d', num, den.clamp(min = 1.))
def pad_to_length(
t: Tensor,
length: int,
value = None
):
seq_len = t.shape[-1]
if length > seq_len:
t = F.pad(t, (0, length - seq_len), value = value)
return t[..., :length]
def interpolate_1d(
x: Tensor,
length: int,
mode = 'bilinear'
):
x = rearrange(x, 'n d -> 1 d n 1')
x = F.interpolate(x, (length, 1), mode = mode)
return rearrange(x, '1 d n 1 -> n d')
# to mel spec
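# MelSpec: log-compressed mel spectrogram front end (torchaudio MelSpectrogram followed
# by a clamped log); lazily moves itself to the input's device.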
class MelSpec(Module):
def __init__(
self,
filter_length = 1024,
hop_length = 256,
win_length = 1024,
n_mel_channels = 100,
sampling_rate = 24_000,
normalize = False,
power = 1,
norm = None,
center = True,
):
super().__init__()
self.n_mel_channels = n_mel_channels
self.sampling_rate = sampling_rate
self.mel_stft = torchaudio.transforms.MelSpectrogram(
sample_rate = sampling_rate,
n_fft = filter_length,
win_length = win_length,
hop_length = hop_length,
n_mels = n_mel_channels,
power = power,
center = center,
normalized = normalize,
norm = norm,
)
self.register_buffer('dummy', tensor(0), persistent = False)
def forward(self, inp):
if len(inp.shape) == 3:
inp = rearrange(inp, 'b 1 nw -> b nw')
assert len(inp.shape) == 2
if self.dummy.device != inp.device:
self.to(inp.device)
mel = self.mel_stft(inp)
mel = log(mel)
return mel
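# Frozen EnCodec wrapper: forward() returns the continuous encoder latents of a waveform,
# decode() reconstructs audio from latents.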
class EncodecWrapper(Module):
def __init__(self, path):
super().__init__()
self.model = EncodecModel.from_pretrained(path)
self.processor = AutoProcessor.from_pretrained(path)
for param in self.model.parameters():
param.requires_grad = False
self.model.eval()
def forward(self, waveform):
with torch.no_grad():
inputs = self.processor(raw_audio=waveform[0], sampling_rate=self.processor.sampling_rate, return_tensors="pt")
emb = self.model.encoder(inputs.input_values)
return emb
def decode(self, emb):
with torch.no_grad():
output = self.model.decoder(emb)
return output[0]
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from audioldm.utils import default_audioldm_config, get_metadata
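# Load the frozen AudioLDM VAE and Tacotron STFT for the given checkpoint name; the
# scale_factor stored in the checkpoint is injected into the VAE config.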
def build_pretrained_models(name):
checkpoint = torch.load(get_metadata()[name]["path"], map_location="cpu")
scale_factor = checkpoint["state_dict"]["scale_factor"].item()
vae_state_dict = {k[18:]: v for k, v in checkpoint["state_dict"].items() if "first_stage_model." in k}
config = default_audioldm_config(name)
vae_config = config["model"]["params"]["first_stage_config"]["params"]
vae_config["scale_factor"] = scale_factor
vae = AutoencoderKL(**vae_config)
vae.load_state_dict(vae_state_dict)
fn_STFT = TacotronSTFT(
config["preprocessing"]["stft"]["filter_length"],
config["preprocessing"]["stft"]["hop_length"],
config["preprocessing"]["stft"]["win_length"],
config["preprocessing"]["mel"]["n_mel_channels"],
config["preprocessing"]["audio"]["sampling_rate"],
config["preprocessing"]["mel"]["mel_fmin"],
config["preprocessing"]["mel"]["mel_fmax"],
)
vae.eval()
fn_STFT.eval()
return vae, fn_STFT
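# Frozen AudioLDM VAE wrapper: only decode() is implemented; (b, d, l) latents are
# reshaped to (b, 8, l, 16), decoded to mel, then vocoded to a waveform.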
class VaeWrapper(Module):
def __init__(self):
super().__init__()
vae, stft = build_pretrained_models("audioldm-s-full")
vae.eval()
stft.eval()
stft = stft.cpu()
self.vae = vae
for param in self.vae.parameters():
param.requires_grad = False
def forward(self, waveform):
return None
def decode(self, emb):
with torch.no_grad():
b, d, l = emb.shape
latents = emb.transpose(1,2).reshape(b, l, 8, 16).transpose(1,2)
mel = self.vae.decode_first_stage(latents)
wave = self.vae.decode_to_waveform(mel)
return wave
# convolutional positional generating module
# taken from https://github.com/lucidrains/voicebox-pytorch/blob/main/voicebox_pytorch/voicebox_pytorch.py#L203
class DepthwiseConv(Module):
def __init__(
self,
dim,
*,
kernel_size,
groups = None
):
super().__init__()
assert not divisible_by(kernel_size, 2)
groups = default(groups, dim) # full depthwise conv by default
self.dw_conv1d = nn.Sequential(
nn.Conv1d(dim, dim, kernel_size, groups = groups, padding = kernel_size // 2),
nn.SiLU()
)
def forward(
self,
x,
mask = None
):
if exists(mask):
x = einx.where('b n, b n d, -> b n d', mask, x, 0.)
x = rearrange(x, 'b n c -> b c n')
x = self.dw_conv1d(x)
out = rearrange(x, 'b c n -> b n c')
if exists(mask):
out = einx.where('b n, b n d, -> b n d', mask, out, 0.)
return out
# adaln zero from DiT paper
class AdaLNZero(Module):
def __init__(
self,
dim,
dim_condition = None,
init_bias_value = -2.
):
super().__init__()
dim_condition = default(dim_condition, dim)
self.to_gamma = nn.Linear(dim_condition, dim)
nn.init.zeros_(self.to_gamma.weight)
nn.init.constant_(self.to_gamma.bias, init_bias_value)
def forward(self, x, *, condition):
if condition.ndim == 2:
condition = rearrange(condition, 'b d -> b 1 d')
gamma = self.to_gamma(condition).sigmoid()
return x * gamma
# random projection fourier embedding
class RandomFourierEmbed(Module):
def __init__(self, dim):
super().__init__()
assert divisible_by(dim, 2)
self.register_buffer('weights', torch.randn(dim // 2))
def forward(self, x):
freqs = einx.multiply('i, j -> i j', x, self.weights) * 2 * torch.pi
fourier_embed, _ = pack((x, freqs.sin(), freqs.cos()), 'b *')
return fourier_embed
# character embedding
class CharacterEmbed(Module):
def __init__(
self,
dim,
num_embeds = 256,
):
super().__init__()
self.dim = dim
self.embed = nn.Embedding(num_embeds + 1, dim) # will just use 0 as the 'filler token'
def forward(
self,
text: Int['b nt'],
max_seq_len: int,
**kwargs
) -> Float['b n d']:
text = text + 1 # shift all other token ids up by 1 and use 0 as filler token
text = text[:, :max_seq_len] # just curtail if character tokens are more than the mel spec tokens, one of the edge cases the paper did not address
text = pad_to_length(text, max_seq_len, value = 0)
return self.embed(text)
class InterpolatedCharacterEmbed(Module):
def __init__(
self,
dim,
num_embeds = 256,
):
super().__init__()
self.dim = dim
self.embed = nn.Embedding(num_embeds, dim)
self.abs_pos_mlp = Sequential(
Rearrange('... -> ... 1'),
Linear(1, dim),
nn.SiLU(),
Linear(dim, dim)
)
def forward(
self,
text: Int['b nt'],
max_seq_len: int,
mask: Bool['b n'] | None = None
) -> Float['b n d']:
device = text.device
mask = default(mask, (None,))
interp_embeds = []
interp_abs_positions = []
for one_text, one_mask in zip_longest(text, mask):
valid_text = one_text >= 0
one_text = one_text[valid_text]
one_text_embed = self.embed(one_text)
# save the absolute positions
text_seq_len = one_text.shape[0]
# determine audio sequence length from mask
audio_seq_len = max_seq_len
if exists(one_mask):
audio_seq_len = one_mask.sum().long().item()
# interpolate text embedding to audio embedding length
interp_text_embed = interpolate_1d(one_text_embed, audio_seq_len)
interp_abs_pos = torch.linspace(0, text_seq_len, audio_seq_len, device = device)
interp_embeds.append(interp_text_embed)
interp_abs_positions.append(interp_abs_pos)
interp_embeds = pad_sequence(interp_embeds)
interp_abs_positions = pad_sequence(interp_abs_positions)
interp_embeds = F.pad(interp_embeds, (0, 0, 0, max_seq_len - interp_embeds.shape[-2]))
interp_abs_positions = pad_to_length(interp_abs_positions, max_seq_len)
# pass interp absolute positions through mlp for implicit positions
interp_embeds = interp_embeds + self.abs_pos_mlp(interp_abs_positions)
if exists(mask):
interp_embeds = einx.where('b n, b n d, -> b n d', mask, interp_embeds, 0.)
return interp_embeds
# text audio cross conditioning in multistream setup
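# extended here with a third stream of video-frame features: concatenated (audio, text, frames)
# condition the audio stream, while audio conditions the text and frames streams except at the
# final text layer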
class TextAudioCrossCondition(Module):
def __init__(
self,
dim,
dim_text,
dim_frames,
cond_audio_to_text = True,
):
super().__init__()
#self.text_to_audio = nn.Linear(dim_text + dim, dim, bias = False)
self.text_frames_to_audio = nn.Linear(dim + dim_text + dim_frames, dim, bias = False)
nn.init.zeros_(self.text_frames_to_audio.weight)
self.cond_audio_to_text = cond_audio_to_text
if cond_audio_to_text:
self.audio_to_text = nn.Linear(dim + dim_text, dim_text, bias = False)
nn.init.zeros_(self.audio_to_text.weight)
self.audio_to_frames = nn.Linear(dim + dim_frames, dim_frames, bias = False)
nn.init.zeros_(self.audio_to_frames.weight)
def forward(
self,
audio: Float['b n d'],
text: Float['b n dt'],
frames: Float['b n df'],
):
#audio_text, _ = pack((audio, text), 'b n *')
audio_text_frames, _ = pack((audio, text, frames), 'b n *')
audio_text, _ = pack((audio, text), 'b n *')
audio_frames, _ = pack((audio, frames), 'b n *')
#text_cond = self.text_to_audio(audio_text)
text_cond = self.text_frames_to_audio(audio_text_frames)
audio_cond = self.audio_to_text(audio_text) if self.cond_audio_to_text else 0.
audio_cond2 = self.audio_to_frames(audio_frames) if self.cond_audio_to_text else 0.
return audio + text_cond, text + audio_cond, frames + audio_cond2
# attention and transformer backbone
# for use in both e2tts as well as duration module
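# The audio stream is a UNet-style transformer (skip connections between the first and second
# halves) with optional depthwise-conv positional layers, adaptive RMSNorm time conditioning,
# register tokens, and optional cross-attention to an external context sequence. Smaller
# parallel streams process text and frame embeddings and cross-condition the audio stream in
# each of the first `text_depth` layers.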
class Transformer(Module):
@beartype
def __init__(
self,
*,
dim,
dim_text = None, # will default to half of audio dimension
dim_frames = 512,
depth = 8,
heads = 8,
dim_head = 64,
ff_mult = 4,
text_depth = None,
text_heads = None,
text_dim_head = None,
text_ff_mult = None,
cond_on_time = True,
abs_pos_emb = True,
max_seq_len = 8192,
kernel_size = 31,
dropout = 0.1,
num_registers = 32,
attn_kwargs: dict = dict(
gate_value_heads = True,
softclamp_logits = True,
),
ff_kwargs: dict = dict(),
if_text_modules = True,
if_cross_attn = True,
if_audio_conv = True,
if_text_conv = False
):
super().__init__()
assert divisible_by(depth, 2), 'depth needs to be even'
# absolute positional embedding
self.max_seq_len = max_seq_len
self.abs_pos_emb = nn.Embedding(max_seq_len, dim) if abs_pos_emb else None
self.dim = dim
dim_text = default(dim_text, dim // 2)
self.dim_text = dim_text
self.dim_frames = dim_frames
text_heads = default(text_heads, heads)
text_dim_head = default(text_dim_head, dim_head)
text_ff_mult = default(text_ff_mult, ff_mult)
text_depth = default(text_depth, depth)
assert 1 <= text_depth <= depth, 'must have at least 1 layer of text conditioning, but less than total number of speech layers'
self.depth = depth
self.layers = ModuleList([])
# registers
self.num_registers = num_registers
self.registers = nn.Parameter(torch.zeros(num_registers, dim))
nn.init.normal_(self.registers, std = 0.02)
if if_text_modules:
self.text_registers = nn.Parameter(torch.zeros(num_registers, dim_text))
nn.init.normal_(self.text_registers, std = 0.02)
self.frames_registers = nn.Parameter(torch.zeros(num_registers, dim_frames))
nn.init.normal_(self.frames_registers, std = 0.02)
# rotary embedding
self.rotary_emb = RotaryEmbedding(dim_head)
self.text_rotary_emb = RotaryEmbedding(dim_head)
self.frames_rotary_emb = RotaryEmbedding(dim_head)
# time conditioning
# will use adaptive rmsnorm
self.cond_on_time = cond_on_time
rmsnorm_klass = RMSNorm if not cond_on_time else AdaptiveRMSNorm
postbranch_klass = Identity if not cond_on_time else partial(AdaLNZero, dim = dim)
self.time_cond_mlp = Identity()
if cond_on_time:
self.time_cond_mlp = Sequential(
RandomFourierEmbed(dim),
Linear(dim + 1, dim),
nn.SiLU()
)
for ind in range(depth):
is_later_half = ind >= (depth // 2)
has_text = ind < text_depth
# speech related
if if_audio_conv:
speech_conv = DepthwiseConv(dim, kernel_size = kernel_size)
attn_norm = rmsnorm_klass(dim)
attn = Attention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout, **attn_kwargs)
attn_adaln_zero = postbranch_klass()
if if_cross_attn:
attn_norm2 = rmsnorm_klass(dim)
attn2 = Attention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout, **attn_kwargs)
attn_adaln_zero2 = postbranch_klass()
ff_norm = rmsnorm_klass(dim)
ff = FeedForward(dim = dim, glu = True, mult = ff_mult, dropout = dropout, **ff_kwargs)
ff_adaln_zero = postbranch_klass()
skip_proj = Linear(dim * 2, dim, bias = False) if is_later_half else None
if if_cross_attn:
if if_audio_conv:
speech_modules = ModuleList([
skip_proj,
speech_conv,
attn_norm,
attn,
attn_adaln_zero,
attn_norm2,
attn2,
attn_adaln_zero2,
ff_norm,
ff,
ff_adaln_zero,
])
else:
speech_modules = ModuleList([
skip_proj,
attn_norm,
attn,
attn_adaln_zero,
attn_norm2,
attn2,
attn_adaln_zero2,
ff_norm,
ff,
ff_adaln_zero,
])
else:
if if_audio_conv:
speech_modules = ModuleList([
skip_proj,
speech_conv,
attn_norm,
attn,
attn_adaln_zero,
ff_norm,
ff,
ff_adaln_zero,
])
else:
speech_modules = ModuleList([
skip_proj,
attn_norm,
attn,
attn_adaln_zero,
ff_norm,
ff,
ff_adaln_zero,
])
text_modules = None
if has_text and if_text_modules:
# text related
if if_text_conv:
text_conv = DepthwiseConv(dim_text, kernel_size = kernel_size)
text_attn_norm = RMSNorm(dim_text)
text_attn = Attention(dim = dim_text, heads = text_heads, dim_head = text_dim_head, dropout = dropout, **attn_kwargs)
text_ff_norm = RMSNorm(dim_text)
text_ff = FeedForward(dim = dim_text, glu = True, mult = text_ff_mult, dropout = dropout, **ff_kwargs)
# cross condition
is_last = ind == (text_depth - 1)
cross_condition = TextAudioCrossCondition(dim = dim, dim_text = dim_text, dim_frames = dim_frames, cond_audio_to_text = not is_last)
if if_text_conv:
text_modules = ModuleList([
text_conv,
text_attn_norm,
text_attn,
text_ff_norm,
text_ff,
cross_condition
])
else:
text_modules = ModuleList([
text_attn_norm,
text_attn,
text_ff_norm,
text_ff,
cross_condition
])
if True:
frames_conv = DepthwiseConv(dim_frames, kernel_size = kernel_size)
frames_attn_norm = RMSNorm(dim_frames)
frames_attn = Attention(dim = dim_frames, heads = 8, dim_head = 64, dropout = dropout, **attn_kwargs)
frames_ff_norm = RMSNorm(dim_frames)
frames_ff = FeedForward(dim = dim_frames, glu = True, mult = 4, dropout = dropout, **ff_kwargs)
# cross condition
frames_modules = ModuleList([
frames_conv,
frames_attn_norm,
frames_attn,
frames_ff_norm,
frames_ff
])
self.layers.append(ModuleList([
speech_modules,
text_modules,
frames_modules
]))
self.final_norm = RMSNorm(dim)
self.if_cross_attn = if_cross_attn
self.if_audio_conv = if_audio_conv
self.if_text_conv = if_text_conv
def forward(
self,
x: Float['b n d'],
times: Float['b'] | Float[''] | None = None,
mask: Bool['b n'] | None = None,
text_embed: Float['b n dt'] | None = None,
frames_embed: Float['b n df'] | None = None,
context: Float['b nc dc'] | None = None,
context_mask: Float['b nc'] | None = None
):
batch, seq_len, device = *x.shape[:2], x.device
assert not (exists(times) ^ self.cond_on_time), '`times` must be passed in if `cond_on_time` is set to `True` and vice versa'
# handle absolute positions if needed
if exists(self.abs_pos_emb):
assert seq_len <= self.max_seq_len, f'{seq_len} exceeds the set `max_seq_len` ({self.max_seq_len}) on Transformer'
seq = torch.arange(seq_len, device = device)
x = x + self.abs_pos_emb(seq)
# handle adaptive rmsnorm kwargs
norm_kwargs = dict()
if exists(times):
if times.ndim == 0:
times = repeat(times, ' -> b', b = batch)
times = self.time_cond_mlp(times)
norm_kwargs.update(condition = times)
# register tokens
registers = repeat(self.registers, 'r d -> b r d', b = batch)
x, registers_packed_shape = pack((registers, x), 'b * d')
if exists(mask):
mask = F.pad(mask, (self.num_registers, 0), value = True)
# rotary embedding
rotary_pos_emb = self.rotary_emb.forward_from_seq_len(x.shape[-2])
# text related
if exists(text_embed):
text_rotary_pos_emb = self.text_rotary_emb.forward_from_seq_len(x.shape[-2])
text_registers = repeat(self.text_registers, 'r d -> b r d', b = batch)
text_embed, _ = pack((text_registers, text_embed), 'b * d')
if exists(frames_embed):
frames_rotary_pos_emb = self.frames_rotary_emb.forward_from_seq_len(x.shape[-2])
frames_registers = repeat(self.frames_registers, 'r d -> b r d', b = batch)
frames_embed, _ = pack((frames_registers, frames_embed), 'b * d')
# skip connection related stuff
skips = []
# go through the layers
for ind, (speech_modules, text_modules, frames_modules) in enumerate(self.layers):
layer = ind + 1
if self.if_cross_attn:
if self.if_audio_conv:
(
maybe_skip_proj,
speech_conv,
attn_norm,
attn,
maybe_attn_adaln_zero,
attn_norm2,
attn2,
maybe_attn_adaln_zero2,
ff_norm,
ff,
maybe_ff_adaln_zero
) = speech_modules
else:
(
maybe_skip_proj,
attn_norm,
attn,
maybe_attn_adaln_zero,
attn_norm2,
attn2,
maybe_attn_adaln_zero2,
ff_norm,
ff,
maybe_ff_adaln_zero
) = speech_modules
else:
if self.if_audio_conv:
(
maybe_skip_proj,
speech_conv,
attn_norm,
attn,
maybe_attn_adaln_zero,
ff_norm,
ff,
maybe_ff_adaln_zero
) = speech_modules
else:
(
maybe_skip_proj,
attn_norm,
attn,
maybe_attn_adaln_zero,
ff_norm,
ff,
maybe_ff_adaln_zero
) = speech_modules
# smaller text transformer
if exists(text_embed) and exists(text_modules):
if self.if_text_conv:
(
text_conv,
text_attn_norm,
text_attn,
text_ff_norm,
text_ff,
cross_condition
) = text_modules
else:
(
text_attn_norm,
text_attn,
text_ff_norm,
text_ff,
cross_condition
) = text_modules
if self.if_text_conv:
text_embed = text_conv(text_embed, mask = mask) + text_embed
text_embed = text_attn(text_attn_norm(text_embed), rotary_pos_emb = text_rotary_pos_emb, mask = mask) + text_embed
text_embed = text_ff(text_ff_norm(text_embed)) + text_embed
# frames transformer
(
frames_conv,
frames_attn_norm,
frames_attn,
frames_ff_norm,
frames_ff
) = frames_modules
frames_embed = frames_conv(frames_embed, mask = mask) + frames_embed
frames_embed = frames_attn(frames_attn_norm(frames_embed), rotary_pos_emb = frames_rotary_pos_emb, mask = mask) + frames_embed
frames_embed = frames_ff(frames_ff_norm(frames_embed)) + frames_embed
# cross condition
x, text_embed, frames_embed = cross_condition(x, text_embed, frames_embed)
# skip connection logic
is_first_half = layer <= (self.depth // 2)
is_later_half = not is_first_half
if is_first_half:
skips.append(x)
if is_later_half:
skip = skips.pop()
x = torch.cat((x, skip), dim = -1)
x = maybe_skip_proj(x)
# position generating convolution
if self.if_audio_conv:
x = speech_conv(x, mask = mask) + x
# attention and feedforward blocks
attn_out = attn(attn_norm(x, **norm_kwargs), rotary_pos_emb = rotary_pos_emb, mask = mask)
x = x + maybe_attn_adaln_zero(attn_out, **norm_kwargs)
if self.if_cross_attn:
attn_out = attn2(attn_norm2(x, **norm_kwargs), rotary_pos_emb = rotary_pos_emb, mask = mask, context = context, context_mask = context_mask)
x = x + maybe_attn_adaln_zero2(attn_out, **norm_kwargs)
ff_out = ff(ff_norm(x, **norm_kwargs))
x = x + maybe_ff_adaln_zero(ff_out, **norm_kwargs)
assert len(skips) == 0
_, x = unpack(x, registers_packed_shape, 'b * d')
return self.final_norm(x)
# main classes
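# DurationPredictor: embeds mel (or raw wave) plus text, keeps only a random-length prefix
# during training, mean-pools the transformer output, and regresses the full sequence length
# with a Softplus head (MSE loss against the true length).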
class DurationPredictor(Module):
@beartype
def __init__(
self,
transformer: dict | Transformer,
num_channels = None,
mel_spec_kwargs: dict = dict(),
char_embed_kwargs: dict = dict(),
text_num_embeds = None,
tokenizer: (
Literal['char_utf8', 'phoneme_en'] |
Callable[[list[str]], Int['b nt']]
) = 'char_utf8'
):
super().__init__()
if isinstance(transformer, dict):
transformer = Transformer(
**transformer,
cond_on_time = False
)
# mel spec
self.mel_spec = MelSpec(**mel_spec_kwargs)
self.num_channels = default(num_channels, self.mel_spec.n_mel_channels)
self.transformer = transformer
dim = transformer.dim
dim_text = transformer.dim_text
self.dim = dim
self.proj_in = Linear(self.num_channels, self.dim)
# tokenizer and text embed
if callable(tokenizer):
assert exists(text_num_embeds), '`text_num_embeds` must be given if supplying your own tokenizer encode function'
self.tokenizer = tokenizer
elif tokenizer == 'char_utf8':
text_num_embeds = 256
self.tokenizer = list_str_to_tensor
elif tokenizer == 'phoneme_en':
self.tokenizer, text_num_embeds = get_g2p_en_encode()
elif tokenizer == 'phoneme_zh':
self.tokenizer, text_num_embeds = get_g2p_zh_encode()
else:
raise ValueError(f'unknown tokenizer string {tokenizer}')
self.embed_text = CharacterEmbed(dim_text, num_embeds = text_num_embeds, **char_embed_kwargs)
# to prediction
self.to_pred = Sequential(
Linear(dim, 1, bias = False),
nn.Softplus(),
Rearrange('... 1 -> ...')
)
def forward(
self,
x: Float['b n d'] | Float['b nw'],
*,
text: Int['b nt'] | list[str] | None = None,
lens: Int['b'] | None = None,
return_loss = True
):
# raw wave
if x.ndim == 2:
x = self.mel_spec(x)
x = rearrange(x, 'b d n -> b n d')
            assert x.shape[-1] == self.num_channels
x = self.proj_in(x)
batch, seq_len, device = *x.shape[:2], x.device
# text
text_embed = None
if exists(text):
if isinstance(text, list):
                text = self.tokenizer(text).to(device)
assert text.shape[0] == batch
text_embed = self.embed_text(text, seq_len)
# handle lengths (duration)
if not exists(lens):
lens = torch.full((batch,), seq_len, device = device)
mask = lens_to_mask(lens, length = seq_len)
# if returning a loss, mask out randomly from an index and have it predict the duration
if return_loss:
rand_frac_index = x.new_zeros(batch).uniform_(0, 1)
rand_index = (rand_frac_index * lens).long()
seq = torch.arange(seq_len, device = device)
mask &= einx.less('n, b -> b n', seq, rand_index)
# attending
x = self.transformer(
x,
mask = mask,
text_embed = text_embed,
)
x = maybe_masked_mean(x, mask)
pred = self.to_pred(x)
# return the prediction if not returning loss
if not return_loss:
return pred
# loss
return F.mse_loss(pred, lens.float())
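# E2TTS: conditional flow-matching generator. Audio latents are produced from a masked audio
# condition, a frozen FLAN-T5 text prompt, and video-derived features (Audeo's Video2RollNet
# for keyboard videos, or CLIP/DINOv2 frame embeddings); sampling integrates the learned
# velocity field with torchdiffeq's odeint (Euler by default).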
class E2TTS(Module):
@beartype
def __init__(
self,
transformer: dict | Transformer = None,
duration_predictor: dict | DurationPredictor | None = None,
odeint_kwargs: dict = dict(
#atol = 1e-5,
#rtol = 1e-5,
#method = 'midpoint'
method = "euler"
),
audiocond_drop_prob = 0.30,
cond_drop_prob = 0.20,
prompt_drop_prob = 0.10,
num_channels = None,
mel_spec_module: Module | None = None,
char_embed_kwargs: dict = dict(),
mel_spec_kwargs: dict = dict(),
frac_lengths_mask: tuple[float, float] = (0.7, 1.),
audiocond_snr: tuple[float, float] | None = None,
concat_cond = False,
interpolated_text = False,
text_num_embeds: int | None = None,
tokenizer: (
Literal['char_utf8', 'phoneme_en', 'phoneme_zh'] |
Callable[[list[str]], Int['b nt']]
) = 'char_utf8',
use_vocos = True,
pretrained_vocos_path = 'charactr/vocos-mel-24khz',
sampling_rate: int | None = None,
frame_size: int = 320,
#### dpo
velocity_consistency_weight = -1e-5,
#### dpo
if_cond_proj_in = True,
cond_proj_in_bias = True,
if_embed_text = True,
if_text_encoder2 = True,
if_clip_encoder = False,
video_encoder = "clip_vit"
):
super().__init__()
if isinstance(transformer, dict):
transformer = Transformer(
**transformer,
cond_on_time = True
)
if isinstance(duration_predictor, dict):
duration_predictor = DurationPredictor(**duration_predictor)
self.transformer = transformer
dim = transformer.dim
dim_text = transformer.dim_text
dim_frames = transformer.dim_frames
self.dim = dim
self.dim_text = dim_text
self.frac_lengths_mask = frac_lengths_mask
self.audiocond_snr = audiocond_snr
self.duration_predictor = duration_predictor
# sampling
self.odeint_kwargs = odeint_kwargs
# mel spec
self.mel_spec = default(mel_spec_module, None)
num_channels = default(num_channels, None)
self.num_channels = num_channels
self.sampling_rate = default(sampling_rate, None)
self.frame_size = frame_size
# whether to concat condition and project rather than project both and sum
self.concat_cond = concat_cond
if concat_cond:
self.proj_in = nn.Linear(num_channels * 2, dim)
else:
self.proj_in = nn.Linear(num_channels, dim)
self.cond_proj_in = nn.Linear(num_channels, dim, bias=cond_proj_in_bias) if if_cond_proj_in else None
#self.cond_proj_in = nn.Linear(NOTES, dim, bias=cond_proj_in_bias) if if_cond_proj_in else None
# to prediction
self.to_pred = Linear(dim, num_channels)
# tokenizer and text embed
if callable(tokenizer):
assert exists(text_num_embeds), '`text_num_embeds` must be given if supplying your own tokenizer encode function'
self.tokenizer = tokenizer
elif tokenizer == 'char_utf8':
text_num_embeds = 256
self.tokenizer = list_str_to_tensor
elif tokenizer == 'phoneme_en':
self.tokenizer, text_num_embeds = get_g2p_en_encode()
elif tokenizer == 'phoneme_zh':
self.tokenizer, text_num_embeds = get_g2p_zh_encode()
else:
raise ValueError(f'unknown tokenizer string {tokenizer}')
self.audiocond_drop_prob = audiocond_drop_prob
self.cond_drop_prob = cond_drop_prob
self.prompt_drop_prob = prompt_drop_prob
# text embedding
text_embed_klass = CharacterEmbed if not interpolated_text else InterpolatedCharacterEmbed
self.embed_text = text_embed_klass(dim_text, num_embeds = text_num_embeds, **char_embed_kwargs) if if_embed_text else None
# weight for velocity consistency
self.register_buffer('zero', torch.tensor(0.), persistent = False)
self.velocity_consistency_weight = velocity_consistency_weight
# default vocos for mel -> audio
#if pretrained_vocos_path == 'charactr/vocos-mel-24khz':
# self.vocos = Vocos.from_pretrained(pretrained_vocos_path) if use_vocos else None
#elif pretrained_vocos_path == 'facebook/encodec_24khz':
# self.vocos = EncodecWrapper("facebook/encodec_24khz") if use_vocos else None
#elif pretrained_vocos_path == 'vae':
# self.vocos = VaeWrapper() if use_vocos else None
if if_text_encoder2:
self.tokenizer2 = AutoTokenizer.from_pretrained("./ckpts/flan-t5-large")
self.text_encoder2 = T5EncoderModel.from_pretrained("./ckpts/flan-t5-large")
for param in self.text_encoder2.parameters():
param.requires_grad = False
self.text_encoder2.eval()
self.proj_text = None
self.proj_frames = Linear(NOTES, dim_frames)
if if_clip_encoder:
if video_encoder == "clip_vit":
####pass
self.image_processor = CLIPImageProcessor()
#self.image_encoder = CLIPVisionModelWithProjection.from_pretrained("/ailab-train2/speech/zhanghaomin/models/IP-Adapter/", subfolder="models/image_encoder")
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained("./ckpts/IP-Adapter/", subfolder="sdxl_models/image_encoder")
elif video_encoder == "clip_vit2":
self.image_processor = AutoProcessor.from_pretrained("/ailab-train2/speech/zhanghaomin/models/clip-vit-large-patch14-336/")
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained("/ailab-train2/speech/zhanghaomin/models/clip-vit-large-patch14-336/")
elif video_encoder == "clip_convnext":
self.image_encoder, _, self.image_processor = open_clip.create_model_and_transforms("hf-hub:laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup")
elif video_encoder == "dinov2":
self.image_processor = AutoImageProcessor.from_pretrained("/ailab-train2/speech/zhanghaomin/models/dinov2-giant/")
self.image_encoder = AutoModel.from_pretrained("/ailab-train2/speech/zhanghaomin/models/dinov2-giant/")
elif video_encoder == "mixed":
pass
#self.image_processor1 = CLIPImageProcessor()
#self.image_encoder1 = CLIPVisionModelWithProjection.from_pretrained("/ailab-train2/speech/zhanghaomin/models/IP-Adapter/", subfolder="sdxl_models/image_encoder")
#self.image_processor2 = AutoProcessor.from_pretrained("/ailab-train2/speech/zhanghaomin/models/clip-vit-large-patch14-336/")
#self.image_encoder2 = CLIPVisionModelWithProjection.from_pretrained("/ailab-train2/speech/zhanghaomin/models/clip-vit-large-patch14-336/")
#self.image_encoder3, _, self.image_processor3 = open_clip.create_model_and_transforms("hf-hub:laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup")
#self.image_processor4 = AutoImageProcessor.from_pretrained("/ailab-train2/speech/zhanghaomin/models/dinov2-giant/")
#self.image_encoder4 = AutoModel.from_pretrained("/ailab-train2/speech/zhanghaomin/models/dinov2-giant/")
else:
self.image_processor = None
self.image_encoder = None
if video_encoder != "mixed":
####pass
for param in self.image_encoder.parameters():
param.requires_grad = False
self.image_encoder.eval()
else:
#for param in self.image_encoder1.parameters():
# param.requires_grad = False
#self.image_encoder1.eval()
#for param in self.image_encoder2.parameters():
# param.requires_grad = False
#self.image_encoder2.eval()
#for param in self.image_encoder3.parameters():
# param.requires_grad = False
#self.image_encoder3.eval()
#for param in self.image_encoder4.parameters():
# param.requires_grad = False
#self.image_encoder4.eval()
self.dim_text_raw = 4608
self.proj_text = Linear(self.dim_text_raw, dim_text)
self.video_encoder = video_encoder
#for param in self.vocos.parameters():
# param.requires_grad = False
#self.vocos.eval()
########self.conv1 = nn.Conv3d(6, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.pool1 = nn.Conv3d(64, 64, kernel_size=(1, 2, 2), stride=(1, 2, 2))
####self.conv1 = nn.Conv3d(3, 16, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####self.pool1 = nn.Conv3d(16, 16, kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0,0,0))
####
########self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.pool2 = nn.Conv3d(128, 128, kernel_size=(2, 2, 2), stride=(2, 2, 2))
####self.conv2 = nn.Conv3d(16, 32, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####self.pool2 = nn.Conv3d(32, 32, kernel_size=(2, 2, 2), stride=(1, 2, 2), padding=(0,0,0))
####
########self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.pool3 = nn.Conv3d(256, 256, kernel_size=(1, 2, 2), stride=(1, 2, 2))
####self.conv3a = nn.Conv3d(32, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####self.conv3b = nn.Conv3d(64, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####self.pool3 = nn.Conv3d(64, 64, kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0,0,0))
####
########self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.pool4 = nn.Conv3d(512, 512, kernel_size=(1, 2, 2), stride=(1, 2, 2))
#####self.conv4a = nn.Conv3d(64, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
#####self.conv4b = nn.Conv3d(64, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
######self.pool4 = nn.Conv3d(64, 32, kernel_size=(1, 2, 2), stride=(1, 2, 2), padding=(0,0,0))
#####self.pool4 = nn.ConvTranspose3d(64, 32, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####
########self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
########self.pool5 = nn.ConvTranspose3d(512, 128, kernel_size=(5, 3, 3), stride=(5, 1, 1), padding=(0, 1, 1))
####self.conv5a = nn.Conv3d(64, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####self.conv5b = nn.Conv3d(64, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
####self.pool5 = nn.ConvTranspose3d(64, 32, kernel_size=(3, 3, 3), stride=(3, 1, 1), padding=(0, 1, 1))
#####self.pool5 = nn.Conv3d(256, 256, kernel_size=(1, 2, 2), stride=(1, 2, 2))
####
####self.relu = nn.ReLU()
####self.final_activation = nn.Sigmoid()
####self.dropout = nn.Dropout(p=0.50)
########self.fc5 = nn.Linear(51200, NOTES)
####self.fc5 = nn.Linear(65536, 208)
####self.fc6 = nn.Linear(208, NOTES)
####
#####self.rnn = nn.RNN(NOTES, NOTES, 1)
#####self.fc7 = nn.Linear(NOTES, NOTES)
####
#####self.bn1 = nn.BatchNorm3d(16)
#####self.bn2 = nn.BatchNorm3d(32)
#####self.bn3 = nn.BatchNorm3d(64)
#####self.bn4 = nn.BatchNorm3d(64)
#####self.bn5 = nn.BatchNorm3d(64)
#####self.bn6 = nn.BatchNorm3d(64)
#####self.bn7 = nn.BatchNorm1d(208)
self.video2roll_net = Video2RollNet.resnet18(num_classes=NOTES)
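    # Turn keyboard-video frames into per-frame note probabilities: each frame is stacked with
    # its two neighbours on either side (5-channel input to Audeo's Video2RollNet), the sigmoid
    # note logits are repeated 5x and mean-pooled in pairs (2.5x the video frame rate), then
    # cropped or zero-padded to length l.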
def encode_frames(self, x, l):
#print("x input", x.shape, l) # [1, 1, 251, 100, 900]
b, c, t, w, h = x.shape
assert(c == 1)
x_all = []
for i in range(t):
frames = []
for j in [-2, -1, 0, 1, 2]:
f = min(max(i+j, 0), t-1)
frames.append(x[:,:,f:f+1,:,:])
frames = torch.cat(frames, dim=2) # [b, 1, 5, w, h]
x_all.append(frames)
x = torch.cat(x_all, dim=1).reshape(b*t, 5, w, h) # [b*t, 5, w, h]
#print("x", x.shape, l) # [251, 5, 100, 900]
x = self.video2roll_net(x)
x = nn.Sigmoid()(x)
#print("x output", x.shape) # [251, 51]
####video_multi
####x = x.reshape(b, t, 1, NOTES).repeat(1,1,3,1).reshape(b, t*3, NOTES)
t5 = (t*5//2)*2
x = x.reshape(b, t, 1, NOTES).repeat(1,1,5,1).reshape(b, t*5, NOTES)[:,:t5,:].reshape(b, t5//2, 2, NOTES).mean(2)
b, d, _ = x.shape
#print("encode_frames", x.shape, l)
if d > l:
x = x[:,:l,:]
elif d < l:
x = torch.cat((x, torch.zeros(b,l-d,NOTES,device=x.device)), 1)
        return x
####def encode_frames(self, x, l):
#### x = x[:,:3,:,...]
#### #print("x", x.shape) # [2, 6, 301, 320, 320] # [2, 3, 251, 128, 1024]
#### x = self.conv1(x)
#### #x = self.bn1(x)
#### x = self.relu(x)
#### x = self.dropout(x)
#### #print("conv1", x.shape) # [2, 64, 301, 320, 320] # [2, 16, 251, 128, 1024]
#### x = self.pool1(x)
#### #print("pool1", x.shape) # [2, 64, 301, 160, 160] # [2, 16, 251, 64, 512]
####
#### x = self.conv2(x)
#### #x = self.bn2(x)
#### x = self.relu(x)
#### x = self.dropout(x)
#### #print("conv2", x.shape) # [2, 128, 301, 160, 160] # [2, 32, 250, 64, 512]
#### x = self.pool2(x)
#### #x = self.relu(x)
#### #x = self.dropout(x)
#### #print("pool2", x.shape) # [2, 128, 150, 80, 80] # [2, 32, 250, 32, 256]
####
#### x = self.conv3a(x)
#### #x = self.bn3(x)
#### x = self.relu(x)
#### x = self.dropout(x)
#### x = self.conv3b(x)
#### #x = self.bn4(x)
#### x = self.relu(x)
#### x = self.dropout(x)
#### #print("conv3", x.shape) # [2, 256, 150, 80, 80] # [2, 64, 250, 32, 256]
#### x = self.pool3(x)
#### #x = self.relu(x)
#### #x = self.dropout(x)
#### #print("pool3", x.shape) # [2, 256, 150, 40, 40] # [2, 64, 250, 16, 128]
####
#### #x = self.conv4a(x)
#### #x = self.relu(x)
#### ##x = self.dropout(x)
#### #x = self.conv4b(x)
#### #x = self.relu(x)
#### ##x = self.dropout(x)
#### ###print("conv4", x.shape) # [2, 512, 150, 40, 40] # [2, 64, 250, 16, 128]
#### #x = self.pool4(x)
#### ##print("pool4", x.shape) # [2, 512, 150, 20, 20] # [2, 32, 250, 8, 64]
####
#### x = self.conv5a(x)
#### #x = self.bn5(x)
#### x = self.relu(x)
#### x = self.dropout(x)
#### x = self.conv5b(x)
#### #x = self.bn6(x)
#### x = self.relu(x)
#### x = self.dropout(x)
#### #print("conv5", x.shape) # [2, 512, 150, 20, 20] # [2, 64, 250, 16, 128]
#### x = self.pool5(x)
#### #x = self.relu(x)
#### #x = self.dropout(x)
#### #print("pool5", x.shape) # [2, 128, 750, 20, 20] # [2, 32, 750/250, 16, 128]
####
#### b, c, d, w, h = x.shape
#### x = x.permute(0,2,3,4,1).reshape(b,d,w*h*c)
#### x = self.fc5(x)
#### #x = x.reshape(b,208,d)
#### #x = self.bn7(x)
#### #x = x.reshape(b,d,208)
#### x = self.relu(x)
#### x = self.dropout(x)
#### x = self.fc6(x)
####
#### #x = self.relu(x)
#### #x, _ = self.rnn(x)
#### #x = self.fc7(x)
####
#### x = self.final_activation(x)
####
#### #x = x.reshape(b,d,1,NOTES).repeat(1,1,3,1).reshape(b,d*3,NOTES)
#### #d = d * 3
####
#### #print("encode_frames", x.shape, l)
#### if d > l:
#### x = x[:,:l,:]
#### elif d < l:
#### x = torch.cat((x, torch.zeros(b,l-d,NOTES,device=x.device)), 1)
#### return x
@property
def device(self):
return next(self.parameters()).device
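    # Encode text prompts with the frozen FLAN-T5 encoder; returns (hidden_states, boolean_attention_mask).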
def encode_text(self, prompt):
device = self.device
batch = self.tokenizer2(prompt, max_length=self.tokenizer2.model_max_length, padding=True, truncation=True, return_tensors="pt")
input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
with torch.no_grad():
encoder_hidden_states = self.text_encoder2(input_ids=input_ids, attention_mask=attention_mask)[0]
boolean_encoder_mask = (attention_mask == 1).to(device)
return encoder_hidden_states, boolean_encoder_mask
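    # For each video path, load a cached .npz feature file if present; otherwise decode frames
    # with moviepy and run them through the selected image encoder (CLIP ViT, CLIP ConvNeXt,
    # DINOv2, or mixed) in mini-batches of 20 frames, caching the computed features as .npz.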
def encode_video(self, video_paths, l):
if self.proj_text is None:
d = self.dim_text
else:
d = self.dim_text_raw
device = self.device
b = 20
with torch.no_grad():
video_embeddings = []
video_lens = []
for video_path in video_paths:
if video_path is None:
video_embeddings.append(None)
video_lens.append(0)
continue
if isinstance(video_path, tuple):
video_path, start_sample, max_sample = video_path
else:
start_sample = 0
max_sample = None
if video_path.startswith("/ailab-train2/speech/zhanghaomin/VGGSound/"):
if self.video_encoder == "clip_vit":
feature_path = video_path.replace("/video/", "/feature/").replace(".mp4", ".npz")
elif self.video_encoder == "clip_vit2":
feature_path = video_path.replace("/video/", "/feature_clip_vit2/").replace(".mp4", ".npz")
elif self.video_encoder == "clip_convnext":
feature_path = video_path.replace("/video/", "/feature_clip_convnext/").replace(".mp4", ".npz")
elif self.video_encoder == "dinov2":
feature_path = video_path.replace("/video/", "/feature_dinov2/").replace(".mp4", ".npz")
elif self.video_encoder == "mixed":
feature_path = video_path.replace("/video/", "/feature_mixed/").replace(".mp4", ".npz")
else:
raise Exception("Invalid video_encoder " + self.video_encoder)
else:
if self.video_encoder == "clip_vit":
feature_path = video_path.replace(".mp4", ".generated.npz")
elif self.video_encoder == "clip_vit2":
feature_path = video_path.replace(".mp4", ".generated.clip_vit2.npz")
elif self.video_encoder == "clip_convnext":
feature_path = video_path.replace(".mp4", ".generated.clip_convnext.npz")
elif self.video_encoder == "dinov2":
feature_path = video_path.replace(".mp4", ".generated.dinov2.npz")
elif self.video_encoder == "mixed":
feature_path = video_path.replace(".mp4", ".generated.mixed.npz")
else:
raise Exception("Invalid video_encoder " + self.video_encoder)
if not os.path.exists(feature_path):
#print("video not exist", video_path)
frames, duration = read_frames_with_moviepy(video_path, max_frame_nums=None)
if frames is None:
video_embeddings.append(None)
video_lens.append(0)
continue
if self.video_encoder in ["clip_vit", "clip_vit2", "dinov2"]:
images = self.image_processor(images=frames, return_tensors="pt").to(device)
#print("images", images["pixel_values"].shape, images["pixel_values"].max(), images["pixel_values"].min(), torch.abs(images["pixel_values"]).mean())
elif self.video_encoder in ["clip_convnext"]:
images = []
for i in range(frames.shape[0]):
images.append(self.image_processor(Image.fromarray(frames[i])).unsqueeze(0))
images = torch.cat(images, dim=0).to(device)
#print("images", images.shape, images.max(), images.min(), torch.abs(images).mean())
elif self.video_encoder in ["mixed"]:
images1 = self.image_processor1(images=frames, return_tensors="pt").to(device) # required by the clip_vit fallback below when its cached features are missing
images2 = self.image_processor2(images=frames, return_tensors="pt").to(device)
images4 = self.image_processor4(images=frames, return_tensors="pt").to(device)
images3 = []
for i in range(frames.shape[0]):
images3.append(self.image_processor3(Image.fromarray(frames[i])).unsqueeze(0))
images3 = torch.cat(images3, dim=0).to(device)
else:
raise Exception("Invalid video_encoder " + self.video_encoder)
image_embeddings = []
if self.video_encoder == "clip_vit":
for i in range(math.ceil(images["pixel_values"].shape[0] / b)):
image_embeddings.append(self.image_encoder(pixel_values=images["pixel_values"][i*b: (i+1)*b]).image_embeds.cpu())
elif self.video_encoder == "clip_vit2":
for i in range(math.ceil(images["pixel_values"].shape[0] / b)):
image_embeddings.append(self.image_encoder(pixel_values=images["pixel_values"][i*b: (i+1)*b]).image_embeds.cpu())
elif self.video_encoder == "clip_convnext":
for i in range(math.ceil(images.shape[0] / b)):
image_embeddings.append(self.image_encoder.encode_image(images[i*b: (i+1)*b]).cpu())
elif self.video_encoder == "dinov2":
for i in range(math.ceil(images["pixel_values"].shape[0] / b)):
image_embeddings.append(self.image_encoder(pixel_values=images["pixel_values"][i*b: (i+1)*b]).pooler_output.cpu())
elif self.video_encoder == "mixed":
feature_path1 = feature_path.replace("/feature_mixed/", "/feature/")
if not os.path.exists(feature_path1):
image_embeddings1 = []
for i in range(math.ceil(images1["pixel_values"].shape[0] / b)):
image_embeddings1.append(self.image_encoder1(pixel_values=images1["pixel_values"][i*b: (i+1)*b]).image_embeds.cpu())
image_embeddings1 = torch.cat(image_embeddings1, dim=0)
#np.savez(feature_path1, image_embeddings1, duration)
else:
data1 = np.load(feature_path1)
image_embeddings1 = torch.from_numpy(data1["arr_0"])
feature_path2 = feature_path.replace("/feature_mixed/", "/feature_clip_vit2/")
if not os.path.exists(feature_path2):
image_embeddings2 = []
for i in range(math.ceil(images2["pixel_values"].shape[0] / b)):
image_embeddings2.append(self.image_encoder2(pixel_values=images2["pixel_values"][i*b: (i+1)*b]).image_embeds.cpu())
image_embeddings2 = torch.cat(image_embeddings2, dim=0)
np.savez(feature_path2, image_embeddings2, duration)
else:
data2 = np.load(feature_path2)
image_embeddings2 = torch.from_numpy(data2["arr_0"])
feature_path3 = feature_path.replace("/feature_mixed/", "/feature_clip_convnext/")
if not os.path.exists(feature_path3):
image_embeddings3 = []
for i in range(math.ceil(images3.shape[0] / b)):
image_embeddings3.append(self.image_encoder3.encode_image(images3[i*b: (i+1)*b]).cpu())
image_embeddings3 = torch.cat(image_embeddings3, dim=0)
np.savez(feature_path3, image_embeddings3, duration)
else:
data3 = np.load(feature_path3)
image_embeddings3 = torch.from_numpy(data3["arr_0"])
feature_path4 = feature_path.replace("/feature_mixed/", "/feature_dinov2/")
if not os.path.exists(feature_path4):
image_embeddings4 = []
for i in range(math.ceil(images4["pixel_values"].shape[0] / b)):
image_embeddings4.append(self.image_encoder4(pixel_values=images4["pixel_values"][i*b: (i+1)*b]).pooler_output.cpu())
image_embeddings4 = torch.cat(image_embeddings4, dim=0)
np.savez(feature_path4, image_embeddings4, duration)
else:
data4 = np.load(feature_path4)
image_embeddings4 = torch.from_numpy(data4["arr_0"])
mixed_l = min([image_embeddings1.shape[0], image_embeddings2.shape[0], image_embeddings3.shape[0], image_embeddings4.shape[0]])
for i in range(mixed_l):
image_embeddings.append(torch.cat([image_embeddings1[i:i+1,:], image_embeddings2[i:i+1,:], image_embeddings3[i:i+1,:], image_embeddings4[i:i+1,:]], dim=1))
else:
raise Exception("Invalid video_encoder " + self.video_encoder)
image_embeddings = torch.cat(image_embeddings, dim=0)
#print("image_embeddings", image_embeddings.shape, image_embeddings.max(), image_embeddings.min(), torch.abs(image_embeddings).mean())
np.savez(feature_path, image_embeddings, duration)
else:
#print("video exist", feature_path)
data = np.load(feature_path)
image_embeddings = torch.from_numpy(data["arr_0"])
#print("image_embeddings", image_embeddings.shape, image_embeddings.max(), image_embeddings.min(), torch.abs(image_embeddings).mean())
duration = data["arr_1"].item()
if max_sample is None:
max_sample = int(duration * self.sampling_rate)
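# Nearest-frame resampling from the video frame rate to the mel frame rate: the centre of each
# frame_size-sample hop is mapped to the closest of the image_embeddings.shape[0] video frames
# spread evenly over `duration` seconds, stopping once l frames have been collected.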
interpolated = []
for i in range(start_sample, max_sample, self.frame_size):
j = min(round((i+self.frame_size//2) / self.sampling_rate / (duration / (image_embeddings.shape[0] - 1))), image_embeddings.shape[0] - 1)
interpolated.append(image_embeddings[j:j+1])
if len(interpolated) >= l:
break
interpolated = torch.cat(interpolated, dim=0)
#ll = list(range(start_sample, max_sample, self.frame_size))
#print("encode_video l", len(ll), l, round((ll[-1]+self.frame_size//2) / self.sampling_rate / (duration / (image_embeddings.shape[0] - 1))), image_embeddings.shape[0] - 1)
#print("encode_video one", video_path, duration, image_embeddings.shape, interpolated.shape, l)
video_embeddings.append(interpolated.unsqueeze(0))
video_lens.append(interpolated.shape[0]) # number of mel-rate frames for this clip
max_length = l # every clip is padded/truncated to the requested mel length
for i in range(len(video_embeddings)):
if video_embeddings[i] is None:
video_embeddings[i] = torch.zeros(1, max_length, d)
continue
if video_embeddings[i].shape[1] < max_length:
video_embeddings[i] = torch.cat([video_embeddings[i], torch.zeros(1, max_length-video_embeddings[i].shape[1], d)], 1)
video_embeddings = torch.cat(video_embeddings, 0)
#print("encode_video", l, video_embeddings.shape, video_lens)
return video_embeddings.to(device)
@staticmethod
def encode_video_frames(video_paths, l):
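# Builds the raw pixel input for the Video2Roll piano-transcription encoder. For clips from the
# piano dataset it loads (or creates and caches) grayscale 100x900 keyboard frames, resamples them
# to roughly one frame per video_multi mel frames, and pads everything to a common length; the
# ground-truth MIDI rolls are loaded instead when the video encoder itself is not being trained.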
#### skip video frames
train_video_encoder = True
if not train_video_encoder:
midi_gts = []
for video_path in video_paths:
if video_path is None:
#midi_gts.append(None)
continue
if isinstance(video_path, tuple):
video_path, start_sample, max_sample = video_path
else:
start_sample = 0
max_sample = None
####if video_path.startswith("/ailab-train2/speech/zhanghaomin/scps/instruments/"):
if "/piano_2h_cropped2_cuts/" in video_path:
pass
else:
#midi_gts.append(None)
continue
####midi_gt
midi_gt = torch.from_numpy(np.load(video_path.replace(".mp4", ".3.npy")).astype(np.float32))[:,NOTTE_MIN:NOTE_MAX+1]
#print("midi_gt", midi_gt.shape, midi_gt.max(), midi_gt.min(), torch.abs(midi_gt).mean())
midi_gts.append(midi_gt.unsqueeze(0))
if len(midi_gts) == 0:
return None, None
max_length = l
for i in range(len(midi_gts)):
if midi_gts[i] is None:
midi_gts[i] = torch.zeros(1, max_length, NOTES)
continue
if midi_gts[i].shape[1] < max_length:
midi_gts[i] = torch.cat([midi_gts[i], torch.zeros(1, max_length-midi_gts[i].shape[1], NOTES)], 1)
elif midi_gts[i].shape[1] > max_length:
midi_gts[i] = midi_gts[i][:, :max_length, :]
midi_gts = torch.cat(midi_gts, 0)
video_frames = 1.0 # placeholder: no pixel frames are returned in this branch, only the MIDI targets
#print("encode_video_frames", l, midi_gts.shape, midi_gts.sum())
return video_frames, midi_gts
video_frames = []
video_lens = []
midi_gts = []
for video_path in video_paths:
if video_path is None:
#video_frames.append(None)
video_lens.append(0)
#midi_gts.append(None)
continue
if isinstance(video_path, tuple):
video_path, start_sample, max_sample = video_path
else:
start_sample = 0
max_sample = None
####if video_path.startswith("/ailab-train2/speech/zhanghaomin/scps/instruments/"):
if "/piano_2h_cropped2_cuts/" in video_path:
frames_raw_path = video_path.replace(".mp4", ".generated_frames_raw.2.npz")
if not os.path.exists(frames_raw_path):
frames, duration = read_frames_with_moviepy(video_path, max_frame_nums=None)
if frames is None:
#video_frames.append(None)
video_lens.append(0)
#midi_gts.append(None)
continue
#print("raw image size", frames.shape, video_path)
frames_resized = []
for i in range(frames.shape[0]):
########frames_resized.append(np.asarray(Image.fromarray(frames[i]).resize((320, 320))))
####frames_resized.append(np.asarray(Image.fromarray(frames[i]).resize((1024, 128))))
input_img = Image.fromarray(frames[i]).convert('L')
binarr = np.array(input_img)
input_img = Image.fromarray(binarr.astype(np.uint8))
frames_resized.append(transform(input_img))
####frames_raw = np.array(frames_resized)
frames_raw = np.concatenate(frames_resized).astype(np.float32)[...,np.newaxis]
np.savez(frames_raw_path, frames_raw, duration)
else:
data = np.load(frames_raw_path)
frames_raw = data["arr_0"]
duration = data["arr_1"].item()
####frames_raw = frames_raw.astype(np.float32) / 255.0
#v_frames_raw = frames_raw[1:,:,:,:] - frames_raw[:-1,:,:,:]
#v_frames_raw = np.concatenate((np.zeros((1,v_frames_raw.shape[1],v_frames_raw.shape[2],v_frames_raw.shape[3]), dtype=np.float32), v_frames_raw), axis=0)
##print("v_frames_raw", v_frames_raw.shape, v_frames_raw.max(), v_frames_raw.min(), np.abs(v_frames_raw).mean(), np.abs(v_frames_raw[0,:,:,:]).mean())
#frames_raw = np.concatenate((frames_raw, v_frames_raw), axis=3)
frames_raw = torch.from_numpy(frames_raw)
#print("frames_raw", frames_raw.shape, frames_raw.max(), frames_raw.min(), torch.abs(frames_raw).mean(), "image_embeddings", image_embeddings.shape, image_embeddings.max(), image_embeddings.min(), torch.abs(image_embeddings).mean())
else:
#video_frames.append(None)
video_lens.append(0)
#midi_gts.append(None)
continue
#print("frames_raw", frames_raw.shape, l)
if max_sample is None:
max_sample = int(duration * 24000)
####video_multi = 3.0
video_multi = 2.5
interpolated_frames_raw = []
frame_size_video = int(video_multi*320)
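# With the hard-coded 24 kHz audio and the 320-sample hop (presumably the mel hop size),
# frame_size_video = 2.5 * 320 = 800 samples, i.e. one sampled video frame every 800 / 24000 s,
# about 30 fps.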
for i in range(start_sample, max_sample+frame_size_video, frame_size_video):
j = min(round(i / 24000 / (duration / (frames_raw.shape[0] - 0))), frames_raw.shape[0] - 1)
#print(j)
interpolated_frames_raw.append(frames_raw[j:j+1])
if len(interpolated_frames_raw) >= math.floor(l/video_multi)+1:
#print("break", len(interpolated_frames_raw), l, frames_raw.shape, j)
break
interpolated_frames_raw = torch.cat(interpolated_frames_raw, dim=0)
####v_interpolated_frames_raw = interpolated_frames_raw[1:,:,:,:] - interpolated_frames_raw[:-1,:,:,:]
####v_interpolated_frames_raw = torch.cat((torch.zeros(1,v_interpolated_frames_raw.shape[1],v_interpolated_frames_raw.shape[2],v_interpolated_frames_raw.shape[3]), v_interpolated_frames_raw), 0)
#####print("v_interpolated_frames_raw", v_interpolated_frames_raw.shape, v_interpolated_frames_raw.max(), v_interpolated_frames_raw.min(), torch.abs(v_interpolated_frames_raw).mean(), torch.abs(v_interpolated_frames_raw[0,:,:,:]).mean())
####interpolated_frames_raw = torch.cat((interpolated_frames_raw, v_interpolated_frames_raw), 3)
video_frames.append(interpolated_frames_raw.unsqueeze(0))
video_lens.append(interpolated_frames_raw.shape[0])
####midi_gt
####midi_gt = torch.from_numpy(np.load(video_path.replace(".mp4", ".3.npy")).astype(np.float32))[:,NOTTE_MIN:NOTE_MAX+1]
#####print("midi_gt", midi_gt.shape, midi_gt.max(), midi_gt.min(), torch.abs(midi_gt).mean())
####midi_gts.append(midi_gt.unsqueeze(0))
midi_gts.append(None)
if len(video_frames) == 0:
return None, None
max_length = l # MIDI targets are padded/truncated to the requested mel length
max_length_video = max(math.floor(l/video_multi)+1, max(video_lens))
for i in range(len(video_frames)):
if video_frames[i] is None:
########video_frames[i] = torch.zeros(1, max_length_video, 320, 320, 6)
####video_frames[i] = torch.zeros(1, max_length_video, 128, 1024, 6)
video_frames[i] = torch.zeros(1, max_length_video, 100, 900, 1)
continue
if video_frames[i].shape[1] < max_length_video:
########video_frames[i] = torch.cat([video_frames[i], torch.zeros(1, max_length_video-video_frames[i].shape[1], 320, 320, 6)], 1)
####video_frames[i] = torch.cat([video_frames[i], torch.zeros(1, max_length_video-video_frames[i].shape[1], 128, 1024, 6)], 1)
video_frames[i] = torch.cat([video_frames[i], torch.zeros(1, max_length_video-video_frames[i].shape[1], 100, 900, 1)], 1)
video_frames = torch.cat(video_frames, 0)
video_frames = video_frames.permute(0,4,1,2,3)
for i in range(len(midi_gts)):
if midi_gts[i] is None:
midi_gts[i] = torch.zeros(1, max_length, NOTES)
continue
if midi_gts[i].shape[1] < max_length:
midi_gts[i] = torch.cat([midi_gts[i], torch.zeros(1, max_length-midi_gts[i].shape[1], NOTES)], 1)
elif midi_gts[i].shape[1] > max_length:
midi_gts[i] = midi_gts[i][:, :max_length, :]
midi_gts = torch.cat(midi_gts, 0)
#print("encode_video_frames", l, video_frames.shape, video_lens, midi_gts.shape, midi_gts.sum())
return video_frames, midi_gts
def transformer_with_pred_head(
self,
x: Float['b n d'],
cond: Float['b n d'] | None = None,
times: Float['b'] | None = None,
mask: Bool['b n'] | None = None,
text: Int['b nt'] | Float['b nt dt'] | None = None,
frames_embed: Float['b nf df'] | None = None,
prompt = None,
video_drop_prompt = None,
audio_drop_prompt = None,
drop_audio_cond: bool | None = None,
drop_text_cond: bool | None = None,
drop_text_prompt: bool | None = None,
return_drop_conditions = False
):
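# Single flow-prediction step through the backbone transformer. The (optionally noised) mel
# condition is either concatenated with or summed into x, the per-frame MIDI/visual embedding and
# the T5 prompt encoding are passed as extra conditioning, and each condition can be dropped
# independently (per sample for the audio and prompt) to enable classifier-free guidance later.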
seq_len = x.shape[-2]
bs = x.shape[0]
drop_audio_cond = [default(drop_audio_cond, self.training and random() < self.audiocond_drop_prob) for _ in range(bs)]
drop_text_cond = default(drop_text_cond, self.training and random() < self.cond_drop_prob)
drop_text_prompt = [default(drop_text_prompt, self.training and random() < self.prompt_drop_prob) for _ in range(bs)]
if cond is not None:
for b in range(bs):
if drop_audio_cond[b]:
cond[b] = 0
if audio_drop_prompt is not None and audio_drop_prompt[b]:
cond[b] = 0
if cond is not None:
if self.concat_cond:
# concat condition, given as using voicebox-like scheme
x = torch.cat((cond, x), dim = -1)
x = self.proj_in(x)
if cond is not None:
if not self.concat_cond:
# an alternative is to simply sum the condition
# seems to work fine
cond = self.cond_proj_in(cond)
x = x + cond
# whether to use a text embedding
text_embed = None
if exists(text) and len(text.shape) == 3:
text_embed = text.clone()
if drop_text_cond:
for b in range(bs):
text_embed[b] = 0
elif exists(text) and not drop_text_cond:
text_embed = self.embed_text(text, seq_len, mask = mask)
context, context_mask = None, None
if prompt is not None:
#for b in range(bs):
# if drop_text_prompt[b]:
# prompt[b] = ""
if video_drop_prompt is not None:
for b in range(bs):
if video_drop_prompt[b]:
prompt[b] = "the sound of X X"
context, context_mask = self.encode_text(prompt)
for b in range(bs):
if drop_text_prompt[b]:
context[b] = 0
if video_drop_prompt is not None and video_drop_prompt[b]:
context[b] = 0
#print("cross attention", context.shape, context_mask.shape, x.shape, mask.shape, text_embed.shape if text_embed is not None else None, torch.mean(torch.abs(text_embed), dim=(1,2)))
#print("video_drop_prompt", prompt, video_drop_prompt, context.shape, torch.mean(torch.abs(context), dim=(1,2)))
#print("audio_drop_prompt", audio_drop_prompt, cond.shape, torch.mean(torch.abs(cond), dim=(1,2)))
if self.proj_text is not None:
text_embed = self.proj_text(text_embed)
frames_embed = self.proj_frames(frames_embed)
# attend
attended = self.transformer(
x,
times = times,
mask = mask,
text_embed = text_embed,
frames_embed = frames_embed,
context = context,
context_mask = context_mask
)
pred = self.to_pred(attended)
if not return_drop_conditions:
return pred
return pred, drop_audio_cond, drop_text_cond, drop_text_prompt
def cfg_transformer_with_pred_head(
self,
*args,
cfg_strength: float = 1.,
remove_parallel_component: bool = True,
keep_parallel_frac: float = 0.,
**kwargs,
):
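# Classifier-free guidance: run one pass with all conditions kept and one with all of them
# dropped, then extrapolate, out = pred + cfg_strength * (pred - null_pred); optionally the
# component of the update parallel to pred is removed (see the reference below), which is
# intended to reduce over-saturation at high guidance strengths.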
pred = self.transformer_with_pred_head(*args, drop_audio_cond = False, drop_text_cond = False, drop_text_prompt = False, **kwargs)
if cfg_strength < 1e-5:
return pred
null_pred = self.transformer_with_pred_head(*args, drop_audio_cond = True, drop_text_cond = True, drop_text_prompt = True, **kwargs)
cfg_update = pred - null_pred
if remove_parallel_component:
# https://arxiv.org/abs/2410.02416
parallel, orthogonal = project(cfg_update, pred)
cfg_update = orthogonal + parallel * keep_parallel_frac
return pred + cfg_update * cfg_strength
def add_noise(self, signal, mask, val):
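# Adds white noise to the audio prompt region at an SNR drawn from self.audiocond_snr (fixed to
# the midpoint during validation); the noise is scaled by the mean absolute level of the signal
# inside `mask`, so the requested SNR is met approximately in linear amplitude terms, not dB.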
if self.audiocond_snr is None:
return signal
if not val:
snr = np.random.uniform(self.audiocond_snr[0], self.audiocond_snr[1])
else:
snr = (self.audiocond_snr[0] + self.audiocond_snr[1]) / 2.0
#print("add_noise", self.audiocond_snr, snr, signal.shape, mask) # [True, ..., False]
noise = torch.randn_like(signal)
w = torch.abs(signal[mask]).mean() / (torch.abs(noise[mask]).mean() + 1e-6) / snr
return signal + noise * w
@torch.no_grad()
def sample(
self,
cond: Float['b n d'] | Float['b nw'] | None = None,
*,
text: Int['b nt'] | list[str] | None = None,
lens: Int['b'] | None = None,
duration: int | Int['b'] | None = None,
steps = 32,
cfg_strength = 1., # they used a classifier free guidance strength of 1.
remove_parallel_component = True,
sway_sampling = True,
max_duration = 4096, # in case the duration predictor goes haywire
vocoder: Callable[[Float['b d n']], list[Float['_']]] | None = None,
return_raw_output: bool | None = None,
save_to_filename: str | None = None,
prompt = None,
video_drop_prompt = None,
audio_drop_prompt = None,
video_paths = None,
frames = None,
midis = None
) -> (
Float['b n d'],
list[Float['_']]
):
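# Inference: encode the mel/visual/MIDI conditions, start from Gaussian noise y0, integrate the
# learned flow field from t = 0 to 1 over `steps` (optionally sway-shifted) time points with
# torchdiffeq.odeint, overwrite the conditioned region with the original mel, and finally decode
# to waveform with Vocos or a user-supplied vocoder.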
self.eval()
# raw wave
if cond.ndim == 2:
cond = self.mel_spec(cond)
cond = rearrange(cond, 'b d n -> b n d')
assert cond.shape[-1] == self.num_channels
batch, cond_seq_len, device = *cond.shape[:2], cond.device
if frames is None:
frames_embed = torch.zeros(batch, cond_seq_len, NOTES, device=device)
else:
#### sampling settings
train_video_encoder = True
if train_video_encoder:
frames_embed = self.encode_frames(frames, cond_seq_len)
else:
frames_embed = midis
if frames_embed.shape[1] < cond_seq_len:
frames_embed = torch.cat([frames_embed, torch.zeros(frames_embed.shape[0], cond_seq_len-frames_embed.shape[1], NOTES, device=frames_embed.device)], 1) # pad with zeros matching the batch size and device
elif frames_embed.shape[1] > cond_seq_len:
frames_embed = frames_embed[:, :cond_seq_len, :]
#x0 = torch.zeros(batch, cond_seq_len, 128, device=device)
print("frames_embed midis cond", frames_embed.shape if frames_embed is not None and not isinstance(frames_embed, float) else frames_embed, frames_embed.sum() if frames_embed is not None and not isinstance(frames_embed, float) else frames_embed, midis.shape if midis is not None else midis, midis.sum() if midis is not None else midis, cond.shape if cond is not None else cond, cond.sum() if cond is not None else cond)
if not exists(lens):
lens = torch.full((batch,), cond_seq_len, device = device, dtype = torch.long)
if video_paths is not None:
text = self.encode_video(video_paths, cond_seq_len)
# text
elif isinstance(text, list):
text = self.tokenizer(text).to(device)
assert text.shape[0] == batch
if exists(text):
text_lens = (text != -1).sum(dim = -1)
lens = torch.maximum(text_lens, lens) # make sure lengths are at least those of the text characters
# duration
cond_mask = lens_to_mask(lens)
if exists(duration):
if isinstance(duration, int):
duration = torch.full((batch,), duration, device = device, dtype = torch.long)
elif exists(self.duration_predictor):
duration = self.duration_predictor(cond, text = text, lens = lens, return_loss = False).long()
duration = torch.maximum(lens, duration) # generated duration is never shorter than the conditioning length
duration = duration.clamp(max = max_duration)
assert duration.shape[0] == batch
max_duration = duration.amax()
cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value = 0.)
cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value = False)
cond_mask = rearrange(cond_mask, '... -> ... 1')
mask = lens_to_mask(duration)
#print("mask", duration, mask, mask.shape, lens, cond_mask, cond_mask.shape, text)
# neural ode
def fn(t, x):
# at each step, conditioning is fixed
if lens[0] == duration[0]:
print("No cond", lens, duration)
step_cond = None
else:
step_cond = torch.where(cond_mask, self.add_noise(cond, cond_mask, True), torch.zeros_like(cond))
#step_cond = cond
# predict flow
return self.cfg_transformer_with_pred_head(
x,
step_cond,
times = t,
text = text,
frames_embed = frames_embed,
mask = mask,
prompt = prompt,
video_drop_prompt = video_drop_prompt,
audio_drop_prompt = audio_drop_prompt,
cfg_strength = cfg_strength,
remove_parallel_component = remove_parallel_component
)
####torch.manual_seed(0)
y0 = torch.randn_like(cond)
#y0 = torch.randn_like(x0)
t = torch.linspace(0, 1, steps, device = self.device)
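# Sway sampling (as in F5-TTS-style samplers): t <- t - (cos(pi/2 * t) - 1 + t) = 1 - cos(pi/2 * t),
# which bends the uniform schedule towards small t so more ODE steps are spent early in generation.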
if sway_sampling:
t = t + -1.0 * (torch.cos(torch.pi / 2 * t) - 1 + t)
#print("@@@@", t)
trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
sampled = trajectory[-1]
out = sampled
if lens[0] != duration[0]:
out = torch.where(cond_mask, cond, out)
# able to return raw untransformed output, if not using mel rep
if exists(return_raw_output) and return_raw_output:
return out
# take care of transforming mel to audio if `vocoder` is passed in, or if `use_vocos` is turned on
if exists(vocoder):
assert not exists(self.vocos), '`use_vocos` should not be turned on if you are passing in a custom `vocoder` on sampling'
out = rearrange(out, 'b n d -> b d n')
out = vocoder(out)
elif exists(self.vocos):
audio = []
for mel, one_mask in zip(out, mask):
#one_out = DB_to_amplitude(mel[one_mask], ref = 1., power = 0.5)
one_out = mel[one_mask]
one_out = rearrange(one_out, 'n d -> 1 d n')
one_audio = self.vocos.decode(one_out)
one_audio = rearrange(one_audio, '1 nw -> nw')
audio.append(one_audio)
out = audio
if exists(save_to_filename):
assert exists(vocoder) or exists(self.vocos)
assert exists(self.sampling_rate)
path = Path(save_to_filename)
parent_path = path.parents[0]
parent_path.mkdir(exist_ok = True, parents = True)
for ind, one_audio in enumerate(out):
one_audio = rearrange(one_audio, 'nw -> 1 nw')
if len(out) == 1:
save_path = str(parent_path / f'{path.name}')
else:
save_path = str(parent_path / f'{ind + 1}.{path.name}')
torchaudio.save(save_path, one_audio.detach().cpu(), sample_rate = self.sampling_rate)
return out
def forward(
self,
inp: Float['b n d'] | Float['b nw'], # mel or raw wave
*,
text: Int['b nt'] | list[str] | None = None,
times: int | Int['b'] | None = None,
lens: Int['b'] | None = None,
velocity_consistency_model: E2TTS | None = None,
velocity_consistency_delta = 1e-3,
prompt = None,
video_drop_prompt=None,
audio_drop_prompt=None,
val = False,
video_paths=None,
frames=None,
midis=None
):
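# Training/validation step. Builds the flow-matching pair (x0 noise, x1 mel), a random in-fill
# span to condition on, and the video/MIDI conditioning, then combines: flow-matching MSE over the
# masked span, a weighted MIDI regression loss (when the video encoder is trained end to end), and
# an optional velocity-consistency term plus a DPO-style term that is currently commented out.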
need_velocity_loss = exists(velocity_consistency_model) and self.velocity_consistency_weight > 0.
# handle raw wave
if inp.ndim == 2:
inp = self.mel_spec(inp)
inp = rearrange(inp, 'b d n -> b n d')
assert inp.shape[-1] == self.num_channels
batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, self.device
if video_paths is not None:
text = self.encode_video(video_paths, seq_len)
# handle text as string
elif isinstance(text, list):
text = self.tokenizer(text).to(device)
#print("text tokenized", text[0])
assert text.shape[0] == batch
# lens and mask
if not exists(lens):
lens = torch.full((batch,), seq_len, device = device)
mask = lens_to_mask(lens, length = seq_len)
# get a random span to mask out for training conditionally
if not val:
if self.audiocond_drop_prob > 1.0: # a drop prob above 1 is used as a flag to disable the audio in-fill condition entirely
frac_lengths = torch.zeros((batch,), device = self.device).float().uniform_(1.0,1.0)
else:
frac_lengths = torch.zeros((batch,), device = self.device).float().uniform_(*self.frac_lengths_mask)
else:
frac_lengths = torch.tensor([(0.7+1.0)/2.0]*batch, device = self.device).float()
rand_span_mask = mask_from_frac_lengths(lens, frac_lengths, max_length = seq_len, val = val)
if exists(mask):
rand_span_mask &= mask
# mel is x1
x1 = inp
# main conditional flow training logic
# just ~5 loc
# x0 is gaussian noise
if val:
torch.manual_seed(0)
x0 = torch.randn_like(x1)
if val:
torch.manual_seed(int(time.time()*1000))
# t is random times from above
if times is None:
times = torch.rand((batch,), dtype = dtype, device = self.device)
else:
times = torch.tensor((times,)*batch, dtype = dtype, device = self.device)
t = rearrange(times, 'b -> b 1 1')
# if need velocity consistency, make sure time does not exceed 1.
if need_velocity_loss:
t = t * (1. - velocity_consistency_delta)
# sample xt (w in the paper)
w = (1. - t) * x0 + t * x1
flow = x1 - x0
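# Conditional flow matching with a straight path: w_t = (1 - t) * x0 + t * x1, so the target
# velocity d w_t / dt = x1 - x0 is constant along the path and the network regresses it directly.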
# only predict what is within the random mask span for infilling
if self.audiocond_drop_prob > 1.0:
cond = None
else:
cond = einx.where(
'b n, b n d, b n d -> b n d',
rand_span_mask,
torch.zeros_like(x1), self.add_noise(x1, ~rand_span_mask, val)
)
#### training settings
train_video_encoder = True
train_v2a = True
use_midi_gt = False
train_video_encoder = train_video_encoder
####train_v2a = train_v2a or val
#print("train_video_encoder", train_video_encoder, use_midi_gt, train_v2a)
####
if frames is None:
frames_embed = torch.zeros(batch, seq_len, NOTES, device=device)
midis = torch.zeros(batch, seq_len, NOTES, device=device)
else:
if train_video_encoder:
frames_embed = self.encode_frames(frames, seq_len)
else:
frames_embed = midis
#print("frames_embed midis cond", frames_embed.shape if frames_embed is not None and not isinstance(frames_embed, float) else frames_embed, frames_embed.sum() if frames_embed is not None and not isinstance(frames_embed, float) else frames_embed, midis.shape, midis.sum(), cond.shape if cond is not None else cond, cond.sum() if cond is not None else cond, x1.shape)
if train_video_encoder:
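# MIDI supervision for the video encoder: weighted MSE between the predicted roll (frames_embed)
# and the ground-truth roll; the weight |midis - 0.10| presumably emphasizes bins away from a low
# background activation level of about 0.1 (interpretation, not stated in the source).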
#lw = 1.0
lw = torch.abs(midis-0.10)
#lw = torch.max(torch.abs(midis-0.20), torch.tensor(0.20))
loss_midi = F.mse_loss(frames_embed, midis, reduction = 'none') * lw
#loss_midi = nn.BCELoss(reduction = 'none')(frames_embed, midis) * lw
#print("loss_midi", loss_midi.shape, mask.shape, mask, rand_span_mask.shape, rand_span_mask)
loss_midi = loss_midi[mask[-frames_embed.shape[0]:,...]].mean()
nb, nt, nf = frames_embed.shape # local shape names; b and t already denote the batch size and the flow time tensor
frames_embed_t = frames_embed[:,:(nt//3)*3,:].reshape(nb,nt//3,3,nf).mean(dim=2)
midis_t = midis[:,:(nt//3)*3,:].reshape(nb,nt//3,3,nf).mean(dim=2)
mask_t = mask[-frames_embed.shape[0]:,:(nt//3)*3].reshape(nb,nt//3,3).to(torch.float32).mean(dim=2) >= 0.99
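# Frame-level detection metrics on the 3-frame-pooled rolls: predictions are thresholded at 0.4,
# targets at 0.5. Note that `acc` below is tp / (tp + fp + fn), an intersection-over-union style
# score rather than standard accuracy.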
tp = ((frames_embed_t>=0.4)*(midis_t>=0.5)).to(torch.float)[mask_t[-frames_embed_t.shape[0]:,...]].sum()
fp = ((frames_embed_t>=0.4)*(midis_t<0.5)).to(torch.float)[mask_t[-frames_embed_t.shape[0]:,...]].sum()
fn = ((frames_embed_t<0.4)*(midis_t>=0.5)).to(torch.float)[mask_t[-frames_embed_t.shape[0]:,...]].sum()
tn = ((frames_embed_t<0.4)*(midis_t<0.5)).to(torch.float)[mask_t[-frames_embed_t.shape[0]:,...]].sum()
#print("tp fp fn tn", tp, fp, fn, tn)
pre = tp / (tp + fp) if (tp + fp) != 0 else torch.tensor(0.0, device=device)
rec = tp / (tp + fn) if (tp + fn) != 0 else torch.tensor(0.0, device=device)
f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) != 0 else torch.tensor(0.0, device=device)
acc = tp / (tp + fp + fn) if (tp + fp + fn) != 0 else torch.tensor(0.0, device=device)
else:
loss_midi = torch.tensor(0.0, device=device)
tp = torch.tensor(0.0, device=device)
fp = torch.tensor(0.0, device=device)
fn = torch.tensor(0.0, device=device)
tn = torch.tensor(0.0, device=device)
pre = torch.tensor(0.0, device=device)
rec = torch.tensor(0.0, device=device)
f1 = torch.tensor(0.0, device=device)
acc = torch.tensor(0.0, device=device)
#if train_video_encoder: # loss_midi_zeros * 100.0 # 0.2131/0.1856 # 0.2819/0.2417 # 2.451/
# loss_midi_zeros = F.mse_loss(torch.zeros_like(midis), midis, reduction = 'none')
# loss_midi_zeros = loss_midi_zeros[mask[-frames_embed.shape[0]:,...]].mean()
#else:
# loss_midi_zeros = torch.tensor(0.0, device=device)
if train_v2a:
if use_midi_gt:
frames_embed = midis
if frames_embed.shape[0] < x1.shape[0]:
frames_embed = torch.cat((torch.zeros(x1.shape[0]-frames_embed.shape[0],frames_embed.shape[1],frames_embed.shape[2],device=frames_embed.device), frames_embed), 0)
# transformer and prediction head
if not val:
pred, did_drop_audio_cond, did_drop_text_cond, did_drop_text_prompt = self.transformer_with_pred_head(
w,
cond,
times = times,
text = text,
frames_embed = frames_embed,
mask = mask,
prompt = prompt,
video_drop_prompt = video_drop_prompt,
audio_drop_prompt = audio_drop_prompt,
return_drop_conditions = True
)
else:
pred, did_drop_audio_cond, did_drop_text_cond, did_drop_text_prompt = self.transformer_with_pred_head(
w,
cond,
times = times,
text = text,
frames_embed = frames_embed,
mask = mask,
prompt = prompt,
video_drop_prompt = video_drop_prompt,
audio_drop_prompt = audio_drop_prompt,
drop_audio_cond = False,
drop_text_cond = False,
drop_text_prompt = False,
return_drop_conditions = True
)
# maybe velocity consistency loss
velocity_loss = self.zero
if need_velocity_loss:
#t_with_delta = t + velocity_consistency_delta
#w_with_delta = (1. - t_with_delta) * x0 + t_with_delta * x1
with torch.no_grad():
ema_pred = velocity_consistency_model.transformer_with_pred_head(
w, #w_with_delta,
cond,
times = times, #times + velocity_consistency_delta,
text = text,
frames_embed = frames_embed,
mask = mask,
prompt = prompt,
video_drop_prompt = video_drop_prompt,
audio_drop_prompt = audio_drop_prompt,
drop_audio_cond = did_drop_audio_cond,
drop_text_cond = did_drop_text_cond,
drop_text_prompt = did_drop_text_prompt
)
#velocity_loss = F.mse_loss(pred, ema_pred, reduction = 'none')
velocity_loss = F.mse_loss(ema_pred, flow, reduction = 'none')
velocity_loss = (velocity_loss.mean(-1)*rand_span_mask).mean(-1) #.mean()
ref_losses = velocity_loss[-2:, ...]
ref_losses_w, ref_losses_l = ref_losses.chunk(2)
raw_ref_loss = 0.5 * (ref_losses_w.mean() + ref_losses_l.mean())
ref_diff = ref_losses_w - ref_losses_l
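# The last two items in the batch are treated as a preferred / rejected (w / l) pair: their
# reference losses under the EMA model feed the DPO-style objective that is sketched (and
# currently commented out) further down.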
else:
ref_losses_w, ref_losses_l = 0, 0
# flow matching loss
loss = F.mse_loss(pred, flow, reduction = 'none')
#print("loss", loss.shape, loss, "rand_span_mask", rand_span_mask.shape, rand_span_mask, "loss[rand_span_mask]", loss[rand_span_mask].shape, loss[rand_span_mask])
#### dpo
loss = loss[rand_span_mask].mean()
loss_dpo = torch.tensor(0.0, device=device)
####if val:
#### loss = loss[rand_span_mask].mean()
#### loss_dpo = torch.tensor(0.0, device=device)
#### model_losses_w, model_losses_l = 0, 0
####else:
#### loss_fm = loss[rand_span_mask].mean()
#### loss = (loss.mean(-1)*rand_span_mask).mean(-1) #.mean()
####
#### model_losses = loss[-2:, ...]
#### model_losses_w, model_losses_l = model_losses.chunk(2)
#### raw_model_loss = 0.5 * (model_losses_w.mean() + model_losses_l.mean())
#### model_diff = model_losses_w - model_losses_l
####
#### scale_term = -1
#### inside_term = scale_term * (model_diff - ref_diff)
#### loss_dpo = -1 * F.logsigmoid(inside_term).mean()
#### loss = loss_fm
#### dpo
else:
pred = torch.zeros_like(x0)
loss = torch.tensor(0.0, device=device)
# total loss and get breakdown
#midi_w = 100.0
midi_w = 10.0
#total_loss = loss
#total_loss = loss + loss_midi * midi_w
total_loss = loss + loss_midi * midi_w + loss_dpo
####breakdown = LossBreakdown(loss, loss_midi * midi_w, pre, rec)
breakdown = LossBreakdown(pre, rec, f1, acc)
#breakdown = LossBreakdown(tp, fp, fn, tn)
#### dpo
print("loss", loss, loss_midi * midi_w)
#print("loss", loss, loss_midi * midi_w, loss_dpo, model_losses_w, model_losses_l, ref_losses_w, ref_losses_l)
#### dpo
# return total loss and bunch of intermediates
return E2TTSReturn(total_loss, cond if cond is not None else w, pred, x0 + pred, breakdown)