Upload audiocourse_musicgenreclassifier_p2.py
audiocourse_musicgenreclassifier_p2.py
# -*- coding: utf-8 -*-
"""Copy of AudioCourse_MusicGenreClassifier_P2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19qAGS31MqX04p9EeUgAM2dGvI3SB3pm6
"""

!pip install --upgrade transformers
!pip install datasets
!pip install gradio

from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", "all")
gtzan

# GTZAN does not provide a split in the dataset, so we are creating one
# ourselves
gtzan = gtzan['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
gtzan
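
# As a quick sanity check (an illustrative addition, not in the original
# notebook): count how many examples of each genre landed in each split.
# With seed=42 the split is reproducible, and roughly 10% of the examples
# should end up in the test set; the per-genre counts show whether the
# shuffle left the classes reasonably balanced.
from collections import Counter

print(Counter(gtzan["train"]["genre"]))
print(Counter(gtzan["test"]["genre"]))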

gtzan['train'][0]

# genre is represented as an integer, so let's use the int2str() method of
# the genre feature to map these integers to human-readable names
id2label_fn = gtzan['train'].features['genre'].int2str
id2label_fn(gtzan['train'][0]['genre'])
+
|
31 |
+
# Let’s now listen to a few more examples by using Gradio to create a simple
|
32 |
+
# interface with the Blocks API
|
33 |
+
|
34 |
+
import gradio as gr
|
35 |
+
|
36 |
+
|
37 |
+
def generate_audio():
|
38 |
+
example = gtzan["train"].shuffle()[0]
|
39 |
+
audio = example["audio"]
|
40 |
+
return (
|
41 |
+
audio["sampling_rate"],
|
42 |
+
audio["array"],
|
43 |
+
), id2label_fn(example["genre"])
|
44 |
+
|
45 |
+
|
46 |
+
with gr.Blocks() as demo:
|
47 |
+
with gr.Column():
|
48 |
+
for _ in range(4):
|
49 |
+
audio, label = generate_audio()
|
50 |
+
output = gr.Audio(audio, label=label)
|
51 |
+
|
52 |
+
demo.launch(debug=True)
|

from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

# As we have seen above, the sampling rate of the audio samples in the
# dataset is approximately 22 kHz; let's find the sampling rate the model
# expects
sampling_rate = feature_extractor.sampling_rate
sampling_rate

# The model expects 16 kHz samples, so we can use the cast_column() method to
# downsample the examples to match the requirements of the model.

from datasets import Audio

gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Let's verify that the changes were successful.
gtzan['train'][0]
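
# Quick check (added for clarity): after cast_column, decoding an example
# should report the model's 16 kHz rate rather than the original ~22 kHz.
assert gtzan["train"][0]["audio"]["sampling_rate"] == sampling_rate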

# Works! However, the 1-D NumPy array of the audio has changed: resampling to
# 16 kHz changed the number of samples. Next question: what exactly does the
# feature extractor do? Looking into this

import numpy as np
test_sample = gtzan['train'][0]['audio']
print(f"Mean: {np.mean(test_sample['array']):.3},\n",
      f"Variance: {np.var(test_sample['array']):.3}")

inputs = feature_extractor(test_sample["array"],
                           sampling_rate=test_sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")

print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)
+
|
95 |
+
# the model cannot process audio samples above 30secs,
|
96 |
+
# therefore, wee need to trucate examples that have longer durations.
|
97 |
+
# Let's define a method to set the max_duration and use the feature_extractor
|
98 |
+
# class on a single sample.
|
99 |
+
# Later, we can use the .map() method to apply the same for all samples.
|
100 |
+
|
101 |
+
max_duration = 30.0
|
102 |
+
|
103 |
+
|
104 |
+
def preprocess_function(examples):
|
105 |
+
audio_arrays = [x["array"] for x in examples["audio"]]
|
106 |
+
inputs = feature_extractor(
|
107 |
+
audio_arrays,
|
108 |
+
sampling_rate=feature_extractor.sampling_rate,
|
109 |
+
max_length=int(feature_extractor.sampling_rate * max_duration),
|
110 |
+
truncation=True,
|
111 |
+
return_attention_mask=True,
|
112 |
+
)
|
113 |
+
return inputs
|
114 |
+
|
115 |
+
gtzan_encoded = gtzan.map(
|
116 |
+
preprocess_function,
|
117 |
+
remove_columns=["audio", "file"],
|
118 |
+
batched=True,
|
119 |
+
batch_size=100,
|
120 |
+
num_proc=1,
|
121 |
+
)
|
122 |
+
gtzan_encoded
|
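
# A quick look at the encoded data (added for illustration): each example is
# now a list of normalized input values, capped at 16,000 * 30 = 480,000
# samples by the truncation above.
print(len(gtzan_encoded["train"][0]["input_values"]))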

gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}

id2label["7"]

# Begin fine-tuning the model

from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

from huggingface_hub import notebook_login

notebook_login()

!pip install transformers[torch]
!pip install accelerate -U

from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
)
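
# To get a feel for the schedule (an illustrative addition): with roughly 900
# training examples, batch_size=8, and gradient_accumulation_steps=1, each
# epoch runs about 113 optimizer steps, so 10 epochs is on the order of 1,130
# steps, with the first ~10% used for learning-rate warmup (warmup_ratio=0.1).
import math

steps_per_epoch = math.ceil(
    len(gtzan_encoded["train"]) / (batch_size * gradient_accumulation_steps)
)
print(f"~{steps_per_epoch} steps/epoch, ~{steps_per_epoch * num_train_epochs} total")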

!pip install evaluate

import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
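
# A tiny self-check (illustrative, not in the original notebook): feed
# compute_metrics a fabricated EvalPrediction in which 2 of 3 dummy "logit"
# rows argmax to the correct label, so it should report accuracy ~0.667.
from transformers import EvalPrediction

dummy = EvalPrediction(
    predictions=np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]]),
    label_ids=np.array([0, 1, 1]),
)
print(compute_metrics(dummy))  # expect {'accuracy': 0.666...}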

# Now we have all the required pieces.
# Instantiate the Trainer class and train the model.

from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()
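
# Once training finishes, a quick way to try the classifier on a clip
# (an illustrative sketch, not in the original notebook) is the
# audio-classification pipeline, reusing the in-memory model and extractor:
from transformers import pipeline

classifier = pipeline(
    "audio-classification", model=model, feature_extractor=feature_extractor
)
# Any 16 kHz waveform works; here we reuse the resampled test sample.
print(classifier(test_sample["array"]))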

# The huggingface-cli tool ships with the huggingface_hub package
# (there is no PyPI package named huggingface-cli)
!pip install -U huggingface_hub

!huggingface-cli login

kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}
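
# These kwargs populate the model card when pushing the fine-tuned model to
# the Hub; assuming you want to publish, the usual final step is:
trainer.push_to_hub(**kwargs)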