Spaces:

fakeavatar
/

vtubers-speak

Sleeping

App Files Files Community

fakeavatar committed on Mar 24

Commit

8027264

1 Parent(s): 39ce78e

update with new model and contextual seeding

Browse files

Files changed (3) hide show

app.py +23 -0
requirements.txt +2 -1
sample.wav +0 -0

app.py CHANGED Viewed

@@ -5,6 +5,9 @@ import soundfile as sf
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from xcodec2.modeling_xcodec2 import XCodec2Model
 import tempfile
 device = "cuda" if torch.cuda.is_available() else "cpu"
 ####################
@@ -14,6 +17,10 @@ model_name = "fakeavatar/vtubers-4"
 print("Loading tokenizer & model ...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
 model.eval().to(device)
 print("Loading XCodec2Model ...")
@@ -23,6 +30,7 @@ Codec_model.eval().to(device)
 print("Models loaded.")
 ####################
 #  Inference function
 ####################
@@ -49,6 +57,11 @@ def text2speech(input_text, num_samples):
     """
     results = []
     with torch.no_grad():
         for i in range(0, num_samples):
             # Add start and end tokens around the input text
@@ -58,6 +71,14 @@ def text2speech(input_text, num_samples):
                 {"role": "assistant", "content": f"<|SPEECH_GENERATION_START|>"}
             ]
             # tokenizer.apply_chat_template is used in the Llasa-style dialogue model
             input_ids = tokenizer.apply_chat_template(
                 chat,
@@ -82,6 +103,8 @@ def text2speech(input_text, num_samples):
             # Extract newly generated tokens (excluding the input part)
             generated_ids = outputs[0][input_ids.shape[1]:-1]
             speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
             # Extract <|s_23456|> as [23456 ...]

 from transformers import AutoTokenizer, AutoModelForCausalLM
 from xcodec2.modeling_xcodec2 import XCodec2Model
 import tempfile
+import torchaudio
+import os
 device = "cuda" if torch.cuda.is_available() else "cpu"
 ####################
 print("Loading tokenizer & model ...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
+if os.name != "nt" and torch.cuda.is_available():  # 'nt' means Windows, so this runs on Linux/macOS
+    model = torch.compile(model)
+    torch.backends.cudnn.benchmark = True  # Autotune cuDNN kernels; helps most when input shapes repeat (re-tunes on each new shape)
+    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 on Ampere GPUs
 model.eval().to(device)
 print("Loading XCodec2Model ...")
 print("Models loaded.")
 ####################
 #  Inference function
 ####################
     """
     results = []
     with torch.no_grad():
+        audio, sr = torchaudio.load("./sample.wav")
+        vq_code = Codec_model.encode_code(audio.to("cuda"))
+        vq_strings = [f"<|s_{i}|>" for i in vq_code.to("cpu")[0][0].tolist()]
+        vq_str = "".join(vq_strings)
         for i in range(0, num_samples):
             # Add start and end tokens around the input text
                 {"role": "assistant", "content": f"<|SPEECH_GENERATION_START|>"}
             ]
+            chat = [
+                {"role": "system", "content": "the speaker is yui. She has a mild chinese accent and is speaking english. The voice is flowing and nasal, high pitched with a measured speed. The sound is recorded in a fairly clean and carries a medium happy emotion."},
+                {"role": "user", "content": "Convert the text to speech:" + f"<|TEXT_UNDERSTANDING_START|>Hey, wake up! {input_text}<|TEXT_UNDERSTANDING_END|>"},
+                {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + vq_str},
+                # {"role": "user", "content": formatted_text},
+                # {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
+            ]
             # tokenizer.apply_chat_template is used in the Llasa-style dialogue model
             input_ids = tokenizer.apply_chat_template(
                 chat,
             # Extract newly generated tokens (excluding the input part)
             generated_ids = outputs[0][input_ids.shape[1]:-1]
+            if (generated_ids.shape[0] < 2):
+                continue
             speech_tokens_str = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
             # Extract <|s_23456|> as [23456 ...]

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 xcodec2==0.1.3
-soundfile

 xcodec2==0.1.3
+soundfile
+torchaudio

sample.wav ADDED Viewed

Binary file (89.6 kB). View file