Spaces:

StarPigeon
/

ViDove

Sleeping

App Files Files Community

JiaenLiu committed on Mar 28, 2023

Commit

0e0950c

2 Parent(s): 6bff4f1 f2c3799

Merge branch 'eason/refactor' into jiaen/batch_output

Browse files

Files changed (2) hide show

SRT.py +24 -7
pipeline.py +2 -1

SRT.py CHANGED Viewed

@@ -62,6 +62,16 @@ class SRT_segment(object):
     def get_bilingual_str(self) -> str:
         return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
 class SRT_script():
     def __init__(self, segments) -> None:
         self.segments = []
@@ -181,8 +191,12 @@ class SRT_script():
                     #print(lines[i])
         pass
-    def split_seg(self, seg, threshold=500):
-        # TODO: evenly split seg to 2 parts and add new seg into self.segments
         source_text = seg.source_text
         translation = seg.translation
         src_commas = [m.start() for m in re.finditer(',', source_text)]
@@ -191,7 +205,10 @@ class SRT_script():
             src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
         else:
             src_space = [m.start() for m in re.finditer(' ', source_text)]
-            src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
         if len(trans_commas) != 0:
             trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
@@ -233,8 +250,9 @@ class SRT_script():
         return result_list
-    def check_len_and_split(self, threshold=30000):
-        # TODO: if sentence length >= threshold, split this segments to two
         segments = []
         for seg in self.segments:
             if len(seg.translation) > threshold:
@@ -248,7 +266,7 @@ class SRT_script():
         pass
     def check_len_and_split_range(self, range, threshold=30):
-        # TODO: if sentence length >= threshold, split this segments to two
         start_seg_id = range[0]
         end_seg_id = range[1]
         extra_len = 0
@@ -335,7 +353,6 @@ class SRT_script():
     def spell_check_term(self):
         ## known bug: I've will be replaced because i've is not in the dict
         import enchant
         dict = enchant.Dict('en_US')

     def get_bilingual_str(self) -> str:
         return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
+    # def set_translation(self, trans):
+    #     if trans[0] == '，':
+    #         trans = trans[1:]
+    #     self.translation = trans
+    # def set_src_text(self, src_text):
+    #     if src_text[0] == ',':
+    #         src_text = src_text[1:]
+    #     self.source_text = src_text
 class SRT_script():
     def __init__(self, segments) -> None:
         self.segments = []
                     #print(lines[i])
         pass
+    def split_seg(self, seg, threshold):
+        # evenly split seg to 2 parts and add new seg into self.segments
+        if seg.source_text[:2] == ', ':
+            seg.source_text = seg.source_text[2:]
+        if seg.translation[0] == '，':
+            seg.translation = seg.translation[1:]
         source_text = seg.source_text
         translation = seg.translation
         src_commas = [m.start() for m in re.finditer(',', source_text)]
             src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
         else:
             src_space = [m.start() for m in re.finditer(' ', source_text)]
+            if len(src_space) > 0:
+                src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
+            else:
+                src_split_idx = 0
         if len(trans_commas) != 0:
             trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
         return result_list
+    def check_len_and_split(self, threshold=30):
+        # DEPRECATED
+        # if sentence length >= threshold, split this segments to two
         segments = []
         for seg in self.segments:
             if len(seg.translation) > threshold:
         pass
     def check_len_and_split_range(self, range, threshold=30):
+        # if sentence length >= threshold, split this segments to two
         start_seg_id = range[0]
         end_seg_id = range[1]
         extra_len = 0
     def spell_check_term(self):
         ## known bug: I've will be replaced because i've is not in the dict
         import enchant
         dict = enchant.Dict('en_US')

pipeline.py CHANGED Viewed

@@ -144,7 +144,7 @@ else:
 # srt class preprocess
 srt.form_whole_sentence()
-srt.spell_check_term()
 srt.correct_with_force_term()
 srt.write_srt_file_src(srt_file_en)
 script_input = srt.get_source_only()
@@ -262,6 +262,7 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
             flag = True
     # add real-time output back and modify the post-processing to use one batch as a unit.
     srt.set_translation(translate, range, model_name)
     add_length = srt.check_len_and_split_range(range, threshold)
     srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
     # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)

 # srt class preprocess
 srt.form_whole_sentence()
+# srt.spell_check_term()
 srt.correct_with_force_term()
 srt.write_srt_file_src(srt_file_en)
 script_input = srt.get_source_only()
             flag = True
     # add real-time output back and modify the post-processing to use one batch as a unit.
     srt.set_translation(translate, range, model_name)
     add_length = srt.check_len_and_split_range(range, threshold)
     srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
     # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)