fakeavatar committed on
Commit
8027264
·
1 Parent(s): 39ce78e

update with new model and contextual seeding

Browse files
Files changed (3) hide show
  1. app.py +23 -0
  2. requirements.txt +2 -1
  3. sample.wav +0 -0
app.py CHANGED
@@ -5,6 +5,9 @@ import soundfile as sf
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
  from xcodec2.modeling_xcodec2 import XCodec2Model
7
  import tempfile
 
 
 
8
 
9
  device = "cuda" if torch.cuda.is_available() else "cpu"
10
  ####################
@@ -14,6 +17,10 @@ model_name = "fakeavatar/vtubers-4"
14
  print("Loading tokenizer & model ...")
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
  model = AutoModelForCausalLM.from_pretrained(model_name)
 
 
 
 
17
  model.eval().to(device)
18
 
19
  print("Loading XCodec2Model ...")
@@ -23,6 +30,7 @@ Codec_model.eval().to(device)
23
 
24
  print("Models loaded.")
25
 
 
26
  ####################
27
  # Inference function
28
  ####################
@@ -49,6 +57,11 @@ def text2speech(input_text, num_samples):
49
  """
50
  results = []
51
  with torch.no_grad():
 
 
 
 
 
52
  for i in range(0, num_samples):
53
 
54
  # Add start and end tokens around the input text
@@ -58,6 +71,14 @@ def text2speech(input_text, num_samples):
58
  {"role": "assistant", "content": f"<|SPEECH_GENERATION_START|>"}
59
  ]
60
 
 
 
 
 
 
 
 
 
61
  # tokenizer.apply_chat_template is used in the Llasa-style dialogue model
62
  input_ids = tokenizer.apply_chat_template(
63
  chat,
@@ -82,6 +103,8 @@ def text2speech(input_text, num_samples):
82
 
83
  # Extract newly generated tokens (excluding the input part)
84
  generated_ids = outputs[0][input_ids.shape[1]:-1]
 
 
85
  speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
86
 
87
  # Extract <|s_23456|> as [23456 ...]
 
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
  from xcodec2.modeling_xcodec2 import XCodec2Model
7
  import tempfile
8
+ import torchaudio
9
+ import os
10
+
11
 
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  ####################
 
17
  print("Loading tokenizer & model ...")
18
  tokenizer = AutoTokenizer.from_pretrained(model_name)
19
  model = AutoModelForCausalLM.from_pretrained(model_name)
20
+ if os.name != "nt" and torch.cuda.is_available(): # 'nt' means Windows, so this runs on Linux/macOS
21
+ model = torch.compile(model)
22
+ torch.backends.cudnn.benchmark = True # For variable input sizes
23
+ torch.backends.cuda.matmul.allow_tf32 = True # Allow TF32 on Ampere GPUs
24
  model.eval().to(device)
25
 
26
  print("Loading XCodec2Model ...")
 
30
 
31
  print("Models loaded.")
32
 
33
+
34
  ####################
35
  # Inference function
36
  ####################
 
57
  """
58
  results = []
59
  with torch.no_grad():
60
+
61
+ audio, sr = torchaudio.load("./sample.wav")
62
+ vq_code = Codec_model.encode_code(audio.to("cuda"))
63
+ vq_strings = [f"<|s_{i}|>" for i in vq_code.to("cpu")[0][0].tolist()]
64
+ vq_str = "".join(vq_strings)
65
  for i in range(0, num_samples):
66
 
67
  # Add start and end tokens around the input text
 
71
  {"role": "assistant", "content": f"<|SPEECH_GENERATION_START|>"}
72
  ]
73
 
74
+ chat = [
75
+ {"role": "system", "content": "the speaker is yui. She has a mild chinese accent and is speaking english. The voice is flowing and nasal, high pitched with a measured speed. The sound is recorded in a fairly clean and carries a medium happy emotion."},
76
+ {"role": "user", "content": "Convert the text to speech:" + f"<|TEXT_UNDERSTANDING_START|>Hey, wake up! {input_text}<|TEXT_UNDERSTANDING_END|>"},
77
+ {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + vq_str},
78
+ # {"role": "user", "content": formatted_text},
79
+ # {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
80
+ ]
81
+
82
  # tokenizer.apply_chat_template is used in the Llasa-style dialogue model
83
  input_ids = tokenizer.apply_chat_template(
84
  chat,
 
103
 
104
  # Extract newly generated tokens (excluding the input part)
105
  generated_ids = outputs[0][input_ids.shape[1]:-1]
106
+ if (generated_ids.shape[0] < 2):
107
+ continue
108
  speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
109
 
110
  # Extract <|s_23456|> as [23456 ...]
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  xcodec2==0.1.3
2
- soundfile
 
 
1
  xcodec2==0.1.3
2
+ soundfile
3
+ torchaudio
sample.wav ADDED
Binary file (89.6 kB). View file