mjuvilla committed on
Commit
bc3b289
·
1 Parent(s): 6e54822

Updated readme and added salamandraTA7b translator class

Browse files
readme.md CHANGED
@@ -1,6 +1,6 @@
1
  # document_translator
2
 
3
- Project to translate files (for now .docx) using BSC's models while keeping the formatting and style of the original file.
4
 
5
  ## Requirements
6
  ### python 3.12
@@ -16,3 +16,12 @@ I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from ht
16
  ### python requirements
17
 
18
  pip install -r requirements.txt
 
 
 
 
 
 
 
 
 
 
1
  # document_translator
2
 
3
+ Project to translate files using BSC's models while keeping the formatting and style of the original file.
4
 
5
  ## Requirements
6
  ### python 3.12
 
16
  ### python requirements
17
 
18
  pip install -r requirements.txt
19
+
20
+ ### mtuoc_aina_translator
21
+
22
+ To use this class, you also need a running MTUOC translation server loaded with the appropriate translation models. There is no
23
+ need to run fastalign on the server side, since this project already runs it.
24
+
25
+ ### salamandrata7b_translator
26
+
27
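+ Class that translates through the SalamandraTA-7B demo hosted on Hugging Face (`BSC-LT/SalamandraTA-7B-Demo`). It needs a Hugging Face token.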
src/mtuoc_aina_translator.py CHANGED
@@ -1,20 +1,19 @@
1
  import requests
2
  import json
3
- from nltk.tokenize import sent_tokenize
4
 
5
  class MTUOCAinaTranslator:
6
  def __init__(self, ip: str, port: str):
7
  self.ip = ip
8
  self.port = port
9
 
10
- def translate(self, text):
11
- stuff = sent_tokenize(text)
12
-
13
  myobj = {
14
  'id': '1',
15
  'src': text,
16
  }
17
- url = 'http://' + self.ip + ':' + self.port + '/translate'
 
18
  x = requests.post(url, json=myobj)
19
  json_response = json.loads(x.text)
20
  return json_response['tgt']
 
1
  import requests
2
  import json
3
+
4
 
5
  class MTUOCAinaTranslator:
6
  def __init__(self, ip: str, port: str):
7
  self.ip = ip
8
  self.port = port
9
 
10
+ def translate(self, text, source_lang=None, target_lang=None):
 
 
11
  myobj = {
12
  'id': '1',
13
  'src': text,
14
  }
15
+ url = f'http://{self.ip}:{self.port}/translate'
16
+ #url = 'http://' + self.ip + ':' + self.port + '/translate'
17
  x = requests.post(url, json=myobj)
18
  json_response = json.loads(x.text)
19
  return json_response['tgt']
src/salamandraTA7b_translator.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gradio_client import Client
2
+ from iso639 import languages
3
+
4
+ HF_TOKEN = "YOUR-HF-TOKEN-HERE"
5
+
6
class SalamandraTA7bTranslator:
    """Translator backed by the ``BSC-LT/SalamandraTA-7B-Demo`` Gradio app."""

    def __init__(self, hf_token: str | None = None):
        """Create the Gradio client used for every translation request.

        Args:
            hf_token: Token handed to the Gradio ``Client``. When omitted, the
                module-level ``HF_TOKEN`` constant is used, as before.
        """
        self.client = Client(
            "BSC-LT/SalamandraTA-7B-Demo",
            hf_token=HF_TOKEN if hf_token is None else hf_token,
        )

    @staticmethod
    def _language_name(code: str) -> str:
        """Return the language name registered for a two-letter language code.

        The code is stripped and lower-cased first so that inputs such as
        ``"EN"`` or ``" en "`` resolve the same way as ``"en"``.
        """
        return languages.get(alpha2=code.strip().lower()).name

    def translate(self, text, source_lang, target_lang):
        """Translate ``text`` through the demo's ``/generate_output`` endpoint.

        Args:
            text: Text to translate.
            source_lang: Two-letter code of the source language.
            target_lang: Two-letter code of the target language.

        Returns:
            The first element of the value returned by the demo endpoint.
        """
        # Callers specify languages by code, while the demo is called with
        # language names, so both codes are converted before the request.
        source_name = self._language_name(source_lang)
        target_name = self._language_name(target_lang)
        result = self.client.predict(
            task="Translation",
            source=source_name,
            target=target_name,
            input_text=text,
            mt_text=None,
            api_name="/generate_output",
        )
        return result[0]
src/translate_any_doc.py CHANGED
@@ -271,7 +271,7 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
271
  translated_paragraphs = []
272
  for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
273
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
274
- translated_paragraphs.append(translator.translate(paragraph_text))
275
 
276
  # time to align the translation with the original
277
  print("Generating alignments...")
 
271
  translated_paragraphs = []
272
  for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
273
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
274
+ translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
275
 
276
  # time to align the translation with the original
277
  print("Generating alignments...")