import fileinput
import os
import platform
from subprocess import Popen, PIPE


# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    """Word-align sentence pairs with the external fast_align and atools binaries.

    The forward and reverse alignments are produced by two fast_align runs
    that reuse previously trained parameter files, and are then merged by
    atools with the ``grow-diag-final-and`` heuristic.
    """

    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        """Prepare the command lines used by :meth:`align`.

        Args:
            config_folder: Folder holding ``<src>-<tgt>.params`` /
                ``<src>-<tgt>.err`` and the matching ``<tgt>-<src>`` files.
            source_lang: Language code used to build the config file names.
            target_lang: Language code used to build the config file names.
            temp_folder: Folder where the intermediate files are written
                (``tokenized_sentences_to_align.txt``, ``forward.align``,
                ``reverse.align``).

        Raises:
            OSError: If one of the two ``.err`` files cannot be opened.
        """
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        # Tension (T) and mean length ratio (m) are read back from the
        # ".err" files stored next to the parameter files.
        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            # Relative paths: the binaries are looked up in the current
            # working directory of the process.
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [fastalign_bin, "-i", self.temp_file_path, "-d",
                                "-T", fwd_T, "-m", fwd_m, "-f", forward_params_path]
        # "-r" (with the dash) is the fast_align switch for the reverse
        # direction; a bare "r" is not an option and would be ignored.
        self.reverse_command = [fastalign_bin, "-i", self.temp_file_path, "-d",
                                "-T", rev_T, "-m", rev_m, "-f", reverse_params_path, "-r"]
        self.symmetric_command = [atools_bin, "-i", self.forward_alignment_file_path,
                                  "-j", self.reverse_alignment_file_path,
                                  "-c", "grow-diag-final-and"]

    def __simplify_alignment_file(self, file):
        """Rewrite *file* so each line holds only its third ``|||`` field.

        The previous content is kept next to it as ``<file>.bak`` (an
        existing backup is overwritten).

        Raises:
            IndexError: If a line has fewer than three ``|||``-separated fields.
        """
        backup_path = f"{file}.bak"
        os.replace(file, backup_path)
        # Explicit UTF-8 so the rewrite does not depend on the locale
        # encoding when the lines contain non-ASCII sentence text.
        with open(backup_path, encoding="utf-8") as src, \
                open(file, "w", encoding="utf-8") as dst:
            for line in src:
                dst.write(line.split('|||')[2].strip() + "\n")

    def __read_err(self, err):
        """Extract the tension and length ratio from a ``.err`` file.

        Args:
            err: Path of the file to scan.

        Returns:
            A ``(T, m)`` tuple of strings: the last token of the line
            containing ``final tension`` and of the line containing
            ``expected target length``. A value that is not found is ``''``.
        """
        T, m = '', ''
        # The context manager closes the handle instead of leaking it.
        with open(err) as err_file:
            for line in err_file:
                # expected target length = source length * N
                if 'expected target length' in line:
                    m = line.split()[-1]
                # final tension: N
                elif 'final tension' in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def _parse_alignments(alignments_str):
        """Turn ``i-j`` alignment text into one list of int pairs per line.

        Blank lines (no alignment points) give an empty list, and any run of
        whitespace separates pairs, so trailing spaces are harmless.
        """
        alignments = []
        for line in alignments_str.splitlines():
            pairs = []
            for pair in line.split():
                i, j = pair.split("-")
                pairs.append((int(i), int(j)))
            alignments.append(pairs)
        return alignments

    def align(self, original_sentences, translated_sentences):
        """Align each original sentence with its translation.

        Args:
            original_sentences: Iterable of source sentences (strings).
            translated_sentences: Iterable of target sentences, paired with
                *original_sentences* by position; extra items on either side
                are ignored.

        Returns:
            A list with one entry per line printed by the symmetrisation
            command; each entry is a list of ``(i, j)`` integer tuples.
        """
        # create temporary file which fastalign will use
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.writelines(
                f"{original} ||| {translated}\n"
                for original, translated in zip(original_sentences, translated_sentences)
            )

        with open(self.forward_alignment_file_path, 'w') as f_out, \
                open(self.reverse_alignment_file_path, 'w') as r_out:
            # start both directions so they run concurrently
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)
            # wait for both to finish
            fw_process.wait()
            r_process.wait()

        # the output files contain more fields than needed; keep only the alignment
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment
        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() drains the pipes while waiting; calling wait() first
        # can block forever once the child fills a pipe buffer.
        stdout_data, _ = process.communicate()

        # get final alignments and format them
        return self._parse_alignments(stdout_data.decode('utf-8'))