Navid Arabi committed on
Commit
d86a872
·
1 Parent(s): d271760

add seed func

Browse files
Files changed (4) hide show
  1. app.py +5 -3
  2. config.py +4 -1
  3. import_db.py +0 -71
  4. seed_db.py +52 -0
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return name
5
 
6
- demo = gr.Interface(fn=greet, outputs="text")
 
 
 
7
  demo.launch()
 
1
  import gradio as gr
2
 
3
+ from seed_db import seed
 
4
 
5
def run():
    """Gradio callback: run the database seeding job and return its result."""
    outcome = seed()
    return outcome
7
+
8
# Build the UI around `run`. `inputs` is a required argument of gr.Interface;
# passing None means the callback takes no input components and only the text
# output is shown.
demo = gr.Interface(fn=run, inputs=None, outputs="text")
demo.launch()
config.py CHANGED
@@ -6,4 +6,7 @@ db_config = {
6
  'password': os.environ.get('DB_PASSWORD'),
7
  'host': os.environ.get('DB_HOST'),
8
  'database': os.environ.get('DB_NAME')
9
- }
 
 
 
 
6
  'password': os.environ.get('DB_PASSWORD'),
7
  'host': os.environ.get('DB_HOST'),
8
  'database': os.environ.get('DB_NAME')
9
+ }
10
+
11
# Hugging Face access token, read from the HF_TOKEN environment variable
# (None when the variable is not set).
hf_token = os.environ.get('HF_TOKEN')
# Dataset repository id. The repo id is the *default value*, not the name of
# an environment variable; it can be overridden by setting HF_TTS_DS_REPO.
hf_tts_ds_repo = os.environ.get('HF_TTS_DS_REPO', 'navidved/channelb-raw-data')
import_db.py DELETED
@@ -1,71 +0,0 @@
1
- from datasets import load_dataset
2
- from huggingface_hub import login
3
- import os
4
-
5
- token = os.environ.get('HF_TOKEN')
6
-
7
- login(token=token)
8
-
9
- dataset = load_dataset("navidved/channelb-raw-data", split="train", trust_remote_code=True)
10
-
11
-
12
- print(dataset.column_names)
13
- print(dataset[0])
14
-
15
-
16
-
17
- import mysql.connector
18
-
19
- # اتصال به دیتابیس (مقدارها رو بر اساس نیاز ست کن)
20
- conn = mysql.connector.connect(
21
- host="",
22
- user="",
23
- password="",
24
- database="",
25
- port=32107,
26
- charset="utf8mb4",
27
- use_unicode=True
28
- )
29
-
30
- cursor = conn.cursor()
31
-
32
- # ساخت جدول اگر وجود نداشت
33
- cursor.execute("""
34
- CREATE TABLE IF NOT EXISTS tts_data (
35
- id INT AUTO_INCREMENT PRIMARY KEY,
36
- filename VARCHAR(255),
37
- sentence TEXT
38
- )
39
- """)
40
-
41
- # تعداد رکورد در هر batch
42
- batch_size = 1000
43
- batch = []
44
-
45
- for i, item in enumerate(dataset):
46
- filename = f"sample_{i}.wav"
47
- sentence = item["sentence"]
48
- batch.append((filename, sentence))
49
-
50
- # وقتی batch کامل شد
51
- if len(batch) == batch_size:
52
- cursor.executemany(
53
- "INSERT INTO tts_data (filename, sentence) VALUES (%s, %s)", batch
54
- )
55
- conn.commit()
56
- print(f"✅ {i + 1} رکورد تا الان ثبت شد")
57
- batch = []
58
-
59
- # ثبت رکوردهای باقیمانده (کمتر از batch_size)
60
- if batch:
61
- cursor.executemany(
62
- "INSERT INTO tts_data (filename, sentence) VALUES (%s, %s)", batch
63
- )
64
- conn.commit()
65
- print(f"✅ آخرین {len(batch)} رکورد هم ثبت شد")
66
-
67
- # بستن ارتباط
68
- cursor.close()
69
- conn.close()
70
-
71
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
seed_db.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mysql.connector
2
+ from datasets import load_dataset
3
+ from huggingface_hub import login
4
+ import config
5
+
6
+
7
def seed():
    """Copy the sentences of the configured dataset into the MySQL `tts_data` table.

    Logs in to the Hugging Face Hub with ``config.hf_token``, loads the
    ``train`` split of ``config.hf_tts_ds_repo``, creates the ``tts_data``
    table if it does not exist, and inserts one ``(filename, sentence)`` row
    per dataset item in batches of 1000, committing after every batch.

    Returns:
        The string ``"done!"`` after all rows have been committed.

    Raises:
        Whatever ``login``, ``load_dataset`` or ``mysql.connector`` raise; the
        cursor and connection are closed before the error propagates.
    """
    login(token=config.hf_token)
    dataset = load_dataset(config.hf_tts_ds_repo, split="train", trust_remote_code=True)

    print(dataset.column_names)
    print(dataset[0])

    insert_sql = "INSERT INTO tts_data (filename, sentence) VALUES (%s, %s)"
    batch_size = 1000

    # db_config is a dict of connection keyword arguments, so it has to be
    # unpacked; passing it positionally is rejected by the connector.
    conn = mysql.connector.connect(**config.db_config)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                """
                CREATE TABLE IF NOT EXISTS tts_data (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    filename VARCHAR(255),
                    sentence TEXT
                )
                """
            )

            batch = []
            for i, item in enumerate(dataset):
                # File names are synthesized from the item's position in the dataset.
                batch.append((f"sample_{i}.wav", item["sentence"]))

                if len(batch) == batch_size:
                    cursor.executemany(insert_sql, batch)
                    conn.commit()
                    print(f"✅ {i + 1} records saved!")
                    batch = []

            # Flush the final partial batch (fewer than batch_size rows).
            if batch:
                cursor.executemany(insert_sql, batch)
                conn.commit()
                print(f"✅ last {len(batch)} records saved.")
        finally:
            cursor.close()
    finally:
        # Release the connection even when loading or inserting fails midway.
        conn.close()
    return "done!"