futureProofGlitch committed
Commit 9c3919d · verified · 1 Parent(s): 5edbdba

Upload audiocourse_musicgenreclassifier_p2.py

audiocourse_musicgenreclassifier_p2.py ADDED
@@ -0,0 +1,217 @@
+ # -*- coding: utf-8 -*-
+ """Copy of AudioCourse_MusicGenreClassifier_P2.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+     https://colab.research.google.com/drive/19qAGS31MqX04p9EeUgAM2dGvI3SB3pm6
+ """
+
+ !pip install --upgrade transformers
+ !pip install datasets
+ !pip install gradio
+
+ from datasets import load_dataset
+
+ gtzan = load_dataset("marsyas/gtzan", "all")
+ gtzan
+
+ # GTZAN does not provide a split in the dataset, so we are creating one
+ # ourselves
+ gtzan = gtzan['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
+ gtzan
+
+ gtzan['train'][0]
+
+ # genre is represented as an integer, so let's use the int2str() method of
+ # the genre feature to map these integers to human-readable names
+ id2label_fn = gtzan['train'].features['genre'].int2str
+ id2label_fn(gtzan['train'][0]['genre'])
+
+ # Let's now listen to a few more examples by using Gradio to create a simple
+ # interface with the Blocks API
+
+ import gradio as gr
+
+
+ def generate_audio():
+     example = gtzan["train"].shuffle()[0]
+     audio = example["audio"]
+     return (
+         audio["sampling_rate"],
+         audio["array"],
+     ), id2label_fn(example["genre"])
+
+
+ with gr.Blocks() as demo:
+     with gr.Column():
+         for _ in range(4):
+             audio, label = generate_audio()
+             output = gr.Audio(audio, label=label)
+
+ demo.launch(debug=True)
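+ # Note: with debug=True, launch() blocks this cell and prints any errors
+ # inline; stop the demo to continue running the rest of the script.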
+
+ from transformers import AutoFeatureExtractor
+
+ model_id = "ntu-spml/distilhubert"
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
+     model_id, do_normalize=True, return_attention_mask=True
+ )
+
+ # As we have seen above, the sampling rate of the audio samples in the dataset
+ # is roughly 22 kHz. Let's find the sampling rate the model expects.
+ sampling_rate = feature_extractor.sampling_rate
+ sampling_rate
+
+ # The model needs 16 kHz samples, so we can use the cast_column() method to
+ # downsample the examples to match the requirements of the model.
+
+ from datasets import Audio
+
+ gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+ # Let's verify that the changes were successful.
+ gtzan['train'][0]
+
+ # Works! The 1-D NumPy array of the audio has changed because the example is
+ # now resampled on the fly. Next, let's see what the feature extractor itself
+ # does to a sample.
+
+ import numpy as np
+ test_sample = gtzan['train'][0]['audio']
+ print(f"Mean: {np.mean(test_sample['array']):.3},\n",
+       f"Variance: {np.var(test_sample['array']):.3}")
+
+ inputs = feature_extractor(test_sample["array"],
+                            sampling_rate=test_sample["sampling_rate"])
+
+ print(f"inputs keys: {list(inputs.keys())}")
+
+ print(
+     f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
+ )
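+ # With do_normalize=True the feature extractor rescales each input array to
+ # roughly zero mean and unit variance, which is what the statistics above show.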
+
+ # The model cannot process audio samples longer than 30 seconds, so we need to
+ # truncate examples with longer durations.
+ # Let's define a method to set the max_duration and use the feature_extractor
+ # class on a single sample.
+ # Later, we can use the .map() method to apply the same to all samples.
+
+ max_duration = 30.0
+
+
+ def preprocess_function(examples):
+     audio_arrays = [x["array"] for x in examples["audio"]]
+     inputs = feature_extractor(
+         audio_arrays,
+         sampling_rate=feature_extractor.sampling_rate,
+         max_length=int(feature_extractor.sampling_rate * max_duration),
+         truncation=True,
+         return_attention_mask=True,
+     )
+     return inputs
+
+ gtzan_encoded = gtzan.map(
+     preprocess_function,
+     remove_columns=["audio", "file"],
+     batched=True,
+     batch_size=100,
+     num_proc=1,
+ )
+ gtzan_encoded
+
+ gtzan_encoded = gtzan_encoded.rename_column("genre", "label")
+
+ id2label = {
+     str(i): id2label_fn(i)
+     for i in range(len(gtzan_encoded["train"].features["label"].names))
+ }
+ label2id = {v: k for k, v in id2label.items()}
+
+ id2label["7"]
+
+ # Begin fine-tuning the model
+
+ from transformers import AutoModelForAudioClassification
+
+ num_labels = len(id2label)
+
+ model = AutoModelForAudioClassification.from_pretrained(
+     model_id,
+     num_labels=num_labels,
+     label2id=label2id,
+     id2label=id2label,
+ )
+
+ from huggingface_hub import notebook_login
+
+ notebook_login()
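+ # notebook_login() prompts for a Hugging Face access token with write access,
+ # so the fine-tuned checkpoint can later be pushed to the Hub.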
+
+ !pip install transformers[torch]
+ !pip install accelerate -U
+
+ from transformers import TrainingArguments
+
+ model_name = model_id.split("/")[-1]
+ batch_size = 8
+ gradient_accumulation_steps = 1
+ num_train_epochs = 10
+
+ training_args = TrainingArguments(
+     f"{model_name}-finetuned-gtzan",
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     learning_rate=5e-5,
+     per_device_train_batch_size=batch_size,
+     gradient_accumulation_steps=gradient_accumulation_steps,
+     per_device_eval_batch_size=batch_size,
+     num_train_epochs=num_train_epochs,
+     warmup_ratio=0.1,
+     logging_steps=5,
+     load_best_model_at_end=True,
+     metric_for_best_model="accuracy",
+     fp16=True,
+     push_to_hub=False,
+ )
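+ # Note: if training hits a CUDA out-of-memory error, halving batch_size and
+ # doubling gradient_accumulation_steps keeps the effective batch size the same
+ # while using less GPU memory.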
+
+ !pip install evaluate
+
+ import evaluate
+ import numpy as np
+
+ metric = evaluate.load("accuracy")
+
+
+ def compute_metrics(eval_pred):
+     """Computes accuracy on a batch of predictions"""
+     predictions = np.argmax(eval_pred.predictions, axis=1)
+     return metric.compute(predictions=predictions, references=eval_pred.label_ids)
+
+ # Now we have all the required pieces.
+ # Let's instantiate the Trainer class and train the model.
+
+ from transformers import Trainer
+
+ trainer = Trainer(
+     model,
+     training_args,
+     train_dataset=gtzan_encoded["train"],
+     eval_dataset=gtzan_encoded["test"],
+     tokenizer=feature_extractor,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
+
+ # The huggingface-cli tool is provided by the huggingface_hub package.
+ !pip install -U huggingface_hub
+
+ !huggingface-cli login
+
+ kwargs = {
+     "dataset_tags": "marsyas/gtzan",
+     "dataset": "GTZAN",
+     "model_name": f"{model_name}-finetuned-gtzan",
+     "finetuned_from": model_id,
+     "tasks": "audio-classification",
+ }
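+
+ # The kwargs above match the model-card arguments of trainer.push_to_hub().
+ # Assuming the goal is to publish the fine-tuned checkpoint, the final step
+ # would be:
+ trainer.push_to_hub(**kwargs)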