Spaces:
Running
Running
Navid Arabi
committed on
Commit
·
d86a872
1
Parent(s):
d271760
add seed func
Browse files- app.py +5 -3
- config.py +4 -1
- import_db.py +0 -71
- seed_db.py +52 -0
app.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
|
4 |
-
return name
|
5 |
|
6 |
-
|
|
|
|
|
|
|
7 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from seed_db import seed
|
|
|
4 |
|
5 |
+
def run():
    """Gradio callback: seed the database and return the status message.

    Takes no arguments; returns whatever ``seed()`` returns so the
    interface can show it in its text output.
    """
    return seed()


# The callback takes no user input. ``inputs`` is still passed explicitly
# (as None) because Interface expects both ``inputs`` and ``outputs``.
demo = gr.Interface(fn=run, inputs=None, outputs="text")
demo.launch()
|
config.py
CHANGED
@@ -6,4 +6,7 @@ db_config = {
|
|
6 |
'password': os.environ.get('DB_PASSWORD'),
|
7 |
'host': os.environ.get('DB_HOST'),
|
8 |
'database': os.environ.get('DB_NAME')
|
9 |
-
}
|
|
|
|
|
|
|
|
6 |
'password': os.environ.get('DB_PASSWORD'),
|
7 |
'host': os.environ.get('DB_HOST'),
|
8 |
'database': os.environ.get('DB_NAME')
|
9 |
+
}
|
10 |
+
|
11 |
+
# Hugging Face access token used to authenticate against the Hub.
hf_token = os.environ.get('HF_TOKEN')

# Dataset repo id to seed from. It can be overridden with the HF_TTS_DS_REPO
# environment variable and falls back to the project's dataset repo.
# (Looking up the repo id itself as an environment variable name always
# yields None, which then breaks load_dataset.)
hf_tts_ds_repo = os.environ.get('HF_TTS_DS_REPO', 'navidved/channelb-raw-data')
|
import_db.py
DELETED
@@ -1,71 +0,0 @@
|
|
1 |
-
from datasets import load_dataset
|
2 |
-
from huggingface_hub import login
|
3 |
-
import os
|
4 |
-
|
5 |
-
token = os.environ.get('HF_TOKEN')
|
6 |
-
|
7 |
-
login(token=token)
|
8 |
-
|
9 |
-
dataset = load_dataset("navidved/channelb-raw-data", split="train", trust_remote_code=True)
|
10 |
-
|
11 |
-
|
12 |
-
print(dataset.column_names)
|
13 |
-
print(dataset[0])
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
import mysql.connector
|
18 |
-
|
19 |
-
# اتصال به دیتابیس (مقدارها رو بر اساس نیاز ست کن)
|
20 |
-
conn = mysql.connector.connect(
|
21 |
-
host="",
|
22 |
-
user="",
|
23 |
-
password="",
|
24 |
-
database="",
|
25 |
-
port=32107,
|
26 |
-
charset="utf8mb4",
|
27 |
-
use_unicode=True
|
28 |
-
)
|
29 |
-
|
30 |
-
cursor = conn.cursor()
|
31 |
-
|
32 |
-
# ساخت جدول اگر وجود نداشت
|
33 |
-
cursor.execute("""
|
34 |
-
CREATE TABLE IF NOT EXISTS tts_data (
|
35 |
-
id INT AUTO_INCREMENT PRIMARY KEY,
|
36 |
-
filename VARCHAR(255),
|
37 |
-
sentence TEXT
|
38 |
-
)
|
39 |
-
""")
|
40 |
-
|
41 |
-
# تعداد رکورد در هر batch
|
42 |
-
batch_size = 1000
|
43 |
-
batch = []
|
44 |
-
|
45 |
-
for i, item in enumerate(dataset):
|
46 |
-
filename = f"sample_{i}.wav"
|
47 |
-
sentence = item["sentence"]
|
48 |
-
batch.append((filename, sentence))
|
49 |
-
|
50 |
-
# وقتی batch کامل شد
|
51 |
-
if len(batch) == batch_size:
|
52 |
-
cursor.executemany(
|
53 |
-
"INSERT INTO tts_data (filename, sentence) VALUES (%s, %s)", batch
|
54 |
-
)
|
55 |
-
conn.commit()
|
56 |
-
print(f"✅ {i + 1} رکورد تا الان ثبت شد")
|
57 |
-
batch = []
|
58 |
-
|
59 |
-
# ثبت رکوردهای باقیمانده (کمتر از batch_size)
|
60 |
-
if batch:
|
61 |
-
cursor.executemany(
|
62 |
-
"INSERT INTO tts_data (filename, sentence) VALUES (%s, %s)", batch
|
63 |
-
)
|
64 |
-
conn.commit()
|
65 |
-
print(f"✅ آخرین {len(batch)} رکورد هم ثبت شد")
|
66 |
-
|
67 |
-
# بستن ارتباط
|
68 |
-
cursor.close()
|
69 |
-
conn.close()
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
seed_db.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import mysql.connector
|
2 |
+
from datasets import load_dataset
|
3 |
+
from huggingface_hub import login
|
4 |
+
import config
|
5 |
+
|
6 |
+
|
7 |
+
def seed():
    """Seed the ``tts_data`` MySQL table from the configured Hugging Face dataset.

    Logs in to the Hugging Face Hub with ``config.hf_token``, loads the
    ``train`` split of ``config.hf_tts_ds_repo``, creates the ``tts_data``
    table if it does not exist, and inserts one ``(filename, sentence)`` row
    per dataset item in batches of 1000, committing after each batch.

    Returns:
        str: ``"done!"`` once every row has been inserted and committed.

    NOTE: rows are plain INSERTs with a generated ``sample_{i}.wav`` filename
    and no uniqueness constraint, so calling this more than once appends
    duplicate rows.
    """
    insert_sql = "INSERT INTO tts_data (filename, sentence) VALUES (%s, %s)"
    batch_size = 1000

    login(token=config.hf_token)
    dataset = load_dataset(config.hf_tts_ds_repo, split="train", trust_remote_code=True)

    print(dataset.column_names)
    print(dataset[0])

    # connect() takes its settings as keyword arguments, so the config dict
    # must be unpacked rather than passed as a single positional argument.
    conn = mysql.connector.connect(**config.db_config)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                """
                CREATE TABLE IF NOT EXISTS tts_data (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    filename VARCHAR(255),
                    sentence TEXT
                )
                """
            )

            batch = []
            for i, item in enumerate(dataset):
                batch.append((f"sample_{i}.wav", item["sentence"]))

                # Flush and commit every full batch to bound memory use and
                # keep progress durable if a later batch fails.
                if len(batch) == batch_size:
                    cursor.executemany(insert_sql, batch)
                    conn.commit()
                    print(f"✅ {i + 1} records saved!")
                    batch = []

            # Insert whatever is left over (fewer than batch_size rows).
            if batch:
                cursor.executemany(insert_sql, batch)
                conn.commit()
                print(f"✅ last {len(batch)} records saved.")
        finally:
            # Release the cursor even when an insert or commit raises.
            cursor.close()
    finally:
        # Always close the connection so a failed run does not leak it.
        conn.close()

    return "done!"
|