Upload audiocourse_musicgenreclassifier_p2.py
audiocourse_musicgenreclassifier_p2.py
# -*- coding: utf-8 -*-
"""Copy of AudioCourse_MusicGenreClassifier_P2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19qAGS31MqX04p9EeUgAM2dGvI3SB3pm6
"""

!pip install --upgrade transformers
!pip install datasets
!pip install gradio

from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", "all")
gtzan

# GTZAN does not provide a split in the dataset, so we are creating one
# ourselves
gtzan = gtzan['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
gtzan
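
# As a quick sanity check (an illustrative addition, not in the original
# notebook): count how many examples of each genre landed in each split.
# With seed=42 the split is reproducible, and roughly 10% of the examples
# should end up in the test set; the per-genre counts show whether the
# shuffle left the classes reasonably balanced.
from collections import Counter

print(Counter(gtzan["train"]["genre"]))
print(Counter(gtzan["test"]["genre"]))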

gtzan['train'][0]

# genre is represented as an integer, so let's use the int2str() method of
# the genre feature to map these integers to human-readable names
id2label_fn = gtzan['train'].features['genre'].int2str
id2label_fn(gtzan['train'][0]['genre'])
+
|
31 |
+
# Let’s now listen to a few more examples by using Gradio to create a simple
|
32 |
+
# interface with the Blocks API
|
33 |
+
|
34 |
+
import gradio as gr
|
35 |
+
|
36 |
+
|
37 |
+
def generate_audio():
|
38 |
+
example = gtzan["train"].shuffle()[0]
|
39 |
+
audio = example["audio"]
|
40 |
+
return (
|
41 |
+
audio["sampling_rate"],
|
42 |
+
audio["array"],
|
43 |
+
), id2label_fn(example["genre"])
|
44 |
+
|
45 |
+
|
46 |
+
with gr.Blocks() as demo:
|
47 |
+
with gr.Column():
|
48 |
+
for _ in range(4):
|
49 |
+
audio, label = generate_audio()
|
50 |
+
output = gr.Audio(audio, label=label)
|
51 |
+
|
52 |
+
demo.launch(debug=True)
|

from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

# As we have seen above, the sampling rate of the audio samples in the
# dataset is approximately 22 kHz; let's find the sampling rate the model
# expects
sampling_rate = feature_extractor.sampling_rate
sampling_rate

# The model expects 16 kHz samples, so we can use the cast_column() method to
# downsample the examples to match the requirements of the model.

from datasets import Audio

gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Let's verify that the changes were successful.
gtzan['train'][0]
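
# Quick check (added for clarity): after cast_column, decoding an example
# should report the model's 16 kHz rate rather than the original ~22 kHz.
assert gtzan["train"][0]["audio"]["sampling_rate"] == sampling_rate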

# Works! However, the 1-D NumPy array of the audio has changed: resampling to
# 16 kHz changed the number of samples. Next question: what exactly does the
# feature extractor do? Looking into this

import numpy as np
test_sample = gtzan['train'][0]['audio']
print(f"Mean: {np.mean(test_sample['array']):.3},\n",
      f"Variance: {np.var(test_sample['array']):.3}")

inputs = feature_extractor(test_sample["array"],
                           sampling_rate=test_sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)
+
|
95 |
+
# the model cannot process audio samples above 30secs,
|
96 |
+
# therefore, wee need to trucate examples that have longer durations.
|
97 |
+
# Let's define a method to set the max_duration and use the feature_extractor
|
98 |
+
# class on a single sample.
|
99 |
+
# Later, we can use the .map() method to apply the same for all samples.
|
100 |
+
|
101 |
+
max_duration = 30.0
|
102 |
+
|
103 |
+
|
104 |
+
def preprocess_function(examples):
|
105 |
+
audio_arrays = [x["array"] for x in examples["audio"]]
|
106 |
+
inputs = feature_extractor(
|
107 |
+
audio_arrays,
|
108 |
+
sampling_rate=feature_extractor.sampling_rate,
|
109 |
+
max_length=int(feature_extractor.sampling_rate * max_duration),
|
110 |
+
truncation=True,
|
111 |
+
return_attention_mask=True,
|
112 |
+
)
|
113 |
+
return inputs
|
114 |
+
|
115 |
+
gtzan_encoded = gtzan.map(
|
116 |
+
preprocess_function,
|
117 |
+
remove_columns=["audio", "file"],
|
118 |
+
batched=True,
|
119 |
+
batch_size=100,
|
120 |
+
num_proc=1,
|
121 |
+
)
|
122 |
+
gtzan_encoded
|
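
# A quick look at the encoded data (added for illustration): each example is
# now a list of normalized input values, capped at 16,000 * 30 = 480,000
# samples by the truncation above.
print(len(gtzan_encoded["train"][0]["input_values"]))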

gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}

id2label["7"]

# Begin fine-tuning the model

from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

from huggingface_hub import notebook_login

notebook_login()

!pip install transformers[torch]
!pip install accelerate -U

from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
)
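
# To get a feel for the schedule (an illustrative addition): with roughly 900
# training examples, batch_size=8, and gradient_accumulation_steps=1, each
# epoch runs about 113 optimizer steps, so 10 epochs is on the order of 1,130
# steps, with the first ~10% used for learning-rate warmup (warmup_ratio=0.1).
import math

steps_per_epoch = math.ceil(
    len(gtzan_encoded["train"]) / (batch_size * gradient_accumulation_steps)
)
print(f"~{steps_per_epoch} steps/epoch, ~{steps_per_epoch * num_train_epochs} total")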

!pip install evaluate

import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
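
# A tiny self-check (illustrative, not in the original notebook): feed
# compute_metrics a fabricated EvalPrediction in which 2 of 3 dummy "logit"
# rows argmax to the correct label, so it should report accuracy ~0.667.
from transformers import EvalPrediction

dummy = EvalPrediction(
    predictions=np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]]),
    label_ids=np.array([0, 1, 1]),
)
print(compute_metrics(dummy))  # expect {'accuracy': 0.666...}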

# Now we have all the required pieces.
# Instantiate the Trainer class and train the model.

from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()
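
# Once training finishes, a quick way to try the classifier on a clip
# (an illustrative sketch, not in the original notebook) is the
# audio-classification pipeline, reusing the in-memory model and extractor:
from transformers import pipeline

classifier = pipeline(
    "audio-classification", model=model, feature_extractor=feature_extractor
)
# Any 16 kHz waveform works; here we reuse the resampled test sample.
print(classifier(test_sample["array"]))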

# The huggingface-cli tool ships with the huggingface_hub package
# (there is no PyPI package named huggingface-cli)
!pip install -U huggingface_hub

!huggingface-cli login

kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}
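
# These kwargs populate the model card when pushing the fine-tuned model to
# the Hub; assuming you want to publish, the usual final step is:
trainer.push_to_hub(**kwargs)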