Spaces:
Sleeping
Sleeping
Merge branch 'eason/refactor' into jiaen/batch_output
Browse files- SRT.py +24 -7
- pipeline.py +2 -1
SRT.py
CHANGED
@@ -62,6 +62,16 @@ class SRT_segment(object):
|
|
62 |
def get_bilingual_str(self) -> str:
|
63 |
return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
class SRT_script():
|
66 |
def __init__(self, segments) -> None:
|
67 |
self.segments = []
|
@@ -181,8 +191,12 @@ class SRT_script():
|
|
181 |
#print(lines[i])
|
182 |
pass
|
183 |
|
184 |
-
def split_seg(self, seg, threshold
|
185 |
-
#
|
|
|
|
|
|
|
|
|
186 |
source_text = seg.source_text
|
187 |
translation = seg.translation
|
188 |
src_commas = [m.start() for m in re.finditer(',', source_text)]
|
@@ -191,7 +205,10 @@ class SRT_script():
|
|
191 |
src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
|
192 |
else:
|
193 |
src_space = [m.start() for m in re.finditer(' ', source_text)]
|
194 |
-
|
|
|
|
|
|
|
195 |
|
196 |
if len(trans_commas) != 0:
|
197 |
trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
|
@@ -233,8 +250,9 @@ class SRT_script():
|
|
233 |
return result_list
|
234 |
|
235 |
|
236 |
-
def check_len_and_split(self, threshold=
|
237 |
-
#
|
|
|
238 |
segments = []
|
239 |
for seg in self.segments:
|
240 |
if len(seg.translation) > threshold:
|
@@ -248,7 +266,7 @@ class SRT_script():
|
|
248 |
pass
|
249 |
|
250 |
def check_len_and_split_range(self, range, threshold=30):
|
251 |
-
#
|
252 |
start_seg_id = range[0]
|
253 |
end_seg_id = range[1]
|
254 |
extra_len = 0
|
@@ -335,7 +353,6 @@ class SRT_script():
|
|
335 |
|
336 |
def spell_check_term(self):
|
337 |
## known bug: I've will be replaced because i've is not in the dict
|
338 |
-
|
339 |
|
340 |
import enchant
|
341 |
dict = enchant.Dict('en_US')
|
|
|
62 |
def get_bilingual_str(self) -> str:
|
63 |
return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
|
64 |
|
65 |
+
# def set_translation(self, trans):
|
66 |
+
# if trans[0] == ',':
|
67 |
+
# trans = trans[1:]
|
68 |
+
# self.translation = trans
|
69 |
+
|
70 |
+
# def set_src_text(self, src_text):
|
71 |
+
# if src_text[0] == ',':
|
72 |
+
# src_text = src_text[1:]
|
73 |
+
# self.source_text = src_text
|
74 |
+
|
75 |
class SRT_script():
|
76 |
def __init__(self, segments) -> None:
|
77 |
self.segments = []
|
|
|
191 |
#print(lines[i])
|
192 |
pass
|
193 |
|
194 |
+
def split_seg(self, seg, threshold):
|
195 |
+
# evenly split seg to 2 parts and add new seg into self.segments
|
196 |
+
if seg.source_text[:2] == ', ':
|
197 |
+
seg.source_text = seg.source_text[2:]
|
198 |
+
if seg.translation[0] == ',':
|
199 |
+
seg.translation = seg.translation[1:]
|
200 |
source_text = seg.source_text
|
201 |
translation = seg.translation
|
202 |
src_commas = [m.start() for m in re.finditer(',', source_text)]
|
|
|
205 |
src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
|
206 |
else:
|
207 |
src_space = [m.start() for m in re.finditer(' ', source_text)]
|
208 |
+
if len(src_space) > 0:
|
209 |
+
src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
|
210 |
+
else:
|
211 |
+
src_split_idx = 0
|
212 |
|
213 |
if len(trans_commas) != 0:
|
214 |
trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
|
|
|
250 |
return result_list
|
251 |
|
252 |
|
253 |
+
def check_len_and_split(self, threshold=30):
|
254 |
+
# DEPRECATED
|
255 |
+
# if a segment's translation is longer than threshold, split that segment into two
|
256 |
segments = []
|
257 |
for seg in self.segments:
|
258 |
if len(seg.translation) > threshold:
|
|
|
266 |
pass
|
267 |
|
268 |
def check_len_and_split_range(self, range, threshold=30):
|
269 |
+
# if sentence length >= threshold, split this segment into two
|
270 |
start_seg_id = range[0]
|
271 |
end_seg_id = range[1]
|
272 |
extra_len = 0
|
|
|
353 |
|
354 |
def spell_check_term(self):
|
355 |
## known bug: I've will be replaced because i've is not in the dict
|
|
|
356 |
|
357 |
import enchant
|
358 |
dict = enchant.Dict('en_US')
|
pipeline.py
CHANGED
@@ -144,7 +144,7 @@ else:
|
|
144 |
|
145 |
# srt class preprocess
|
146 |
srt.form_whole_sentence()
|
147 |
-
srt.spell_check_term()
|
148 |
srt.correct_with_force_term()
|
149 |
srt.write_srt_file_src(srt_file_en)
|
150 |
script_input = srt.get_source_only()
|
@@ -262,6 +262,7 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
|
|
262 |
flag = True
|
263 |
# add real-time output back and modify the post-processing by using one batch as a unit.
|
264 |
srt.set_translation(translate, range, model_name)
|
|
|
265 |
add_length = srt.check_len_and_split_range(range, threshold)
|
266 |
srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
|
267 |
# srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
|
|
|
144 |
|
145 |
# srt class preprocess
|
146 |
srt.form_whole_sentence()
|
147 |
+
# srt.spell_check_term()
|
148 |
srt.correct_with_force_term()
|
149 |
srt.write_srt_file_src(srt_file_en)
|
150 |
script_input = srt.get_source_only()
|
|
|
262 |
flag = True
|
263 |
# add real-time output back and modify the post-processing by using one batch as a unit.
|
264 |
srt.set_translation(translate, range, model_name)
|
265 |
+
|
266 |
add_length = srt.check_len_and_split_range(range, threshold)
|
267 |
srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
|
268 |
# srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
|