JiaenLiu committed on
Commit
0e0950c
·
2 Parent(s): 6bff4f1 f2c3799

Merge branch 'eason/refactor' into jiaen/batch_output

Browse files
Files changed (2) hide show
  1. SRT.py +24 -7
  2. pipeline.py +2 -1
SRT.py CHANGED
@@ -62,6 +62,16 @@ class SRT_segment(object):
62
  def get_bilingual_str(self) -> str:
63
  return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
64
 
 
 
 
 
 
 
 
 
 
 
65
  class SRT_script():
66
  def __init__(self, segments) -> None:
67
  self.segments = []
@@ -181,8 +191,12 @@ class SRT_script():
181
  #print(lines[i])
182
  pass
183
 
184
- def split_seg(self, seg, threshold=500):
185
- # TODO: evenly split seg to 2 parts and add new seg into self.segments
 
 
 
 
186
  source_text = seg.source_text
187
  translation = seg.translation
188
  src_commas = [m.start() for m in re.finditer(',', source_text)]
@@ -191,7 +205,10 @@ class SRT_script():
191
  src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
192
  else:
193
  src_space = [m.start() for m in re.finditer(' ', source_text)]
194
- src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
 
 
 
195
 
196
  if len(trans_commas) != 0:
197
  trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
@@ -233,8 +250,9 @@ class SRT_script():
233
  return result_list
234
 
235
 
236
- def check_len_and_split(self, threshold=30000):
237
- # TODO: if sentence length >= threshold, split this segments to two
 
238
  segments = []
239
  for seg in self.segments:
240
  if len(seg.translation) > threshold:
@@ -248,7 +266,7 @@ class SRT_script():
248
  pass
249
 
250
  def check_len_and_split_range(self, range, threshold=30):
251
- # TODO: if sentence length >= threshold, split this segments to two
252
  start_seg_id = range[0]
253
  end_seg_id = range[1]
254
  extra_len = 0
@@ -335,7 +353,6 @@ class SRT_script():
335
 
336
  def spell_check_term(self):
337
  ## known bug: I've will be replaced because i've is not in the dict
338
-
339
 
340
  import enchant
341
  dict = enchant.Dict('en_US')
 
62
  def get_bilingual_str(self) -> str:
63
  return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
64
 
65
+ # def set_translation(self, trans):
66
+ # if trans[0] == ',':
67
+ # trans = trans[1:]
68
+ # self.translation = trans
69
+
70
+ # def set_src_text(self, src_text):
71
+ # if src_text[0] == ',':
72
+ # src_text = src_text[1:]
73
+ # self.source_text = src_text
74
+
75
  class SRT_script():
76
  def __init__(self, segments) -> None:
77
  self.segments = []
 
191
  #print(lines[i])
192
  pass
193
 
194
+ def split_seg(self, seg, threshold):
195
+ # evenly split seg to 2 parts and add new seg into self.segments
196
+ if seg.source_text[:2] == ', ':
197
+ seg.source_text = seg.source_text[2:]
198
+ if seg.translation[0] == ',':
199
+ seg.translation = seg.translation[1:]
200
  source_text = seg.source_text
201
  translation = seg.translation
202
  src_commas = [m.start() for m in re.finditer(',', source_text)]
 
205
  src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
206
  else:
207
  src_space = [m.start() for m in re.finditer(' ', source_text)]
208
+ if len(src_space) > 0:
209
+ src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
210
+ else:
211
+ src_split_idx = 0
212
 
213
  if len(trans_commas) != 0:
214
  trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
 
250
  return result_list
251
 
252
 
253
+ def check_len_and_split(self, threshold=30):
254
+ # DEPRECATED
255
+ # if sentence length > threshold, split this segment into two
256
  segments = []
257
  for seg in self.segments:
258
  if len(seg.translation) > threshold:
 
266
  pass
267
 
268
  def check_len_and_split_range(self, range, threshold=30):
269
+ # if sentence length >= threshold, split this segment into two
270
  start_seg_id = range[0]
271
  end_seg_id = range[1]
272
  extra_len = 0
 
353
 
354
  def spell_check_term(self):
355
  ## known bug: I've will be replaced because i've is not in the dict
 
356
 
357
  import enchant
358
  dict = enchant.Dict('en_US')
pipeline.py CHANGED
@@ -144,7 +144,7 @@ else:
144
 
145
  # srt class preprocess
146
  srt.form_whole_sentence()
147
- srt.spell_check_term()
148
  srt.correct_with_force_term()
149
  srt.write_srt_file_src(srt_file_en)
150
  script_input = srt.get_source_only()
@@ -262,6 +262,7 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
262
  flag = True
263
  # add read-time output back and modify the post-processing by using one batch as an unit.
264
  srt.set_translation(translate, range, model_name)
 
265
  add_length = srt.check_len_and_split_range(range, threshold)
266
  srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
267
  # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
 
144
 
145
  # srt class preprocess
146
  srt.form_whole_sentence()
147
+ # srt.spell_check_term()
148
  srt.correct_with_force_term()
149
  srt.write_srt_file_src(srt_file_en)
150
  script_input = srt.get_source_only()
 
262
  flag = True
263
  # add read-time output back and modify the post-processing by using one batch as an unit.
264
  srt.set_translation(translate, range, model_name)
265
+
266
  add_length = srt.check_len_and_split_range(range, threshold)
267
  srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
268
  # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)