Spaces:
Sleeping
Sleeping
Updated readme and added salamandraTA7b translator class
Browse files- readme.md +10 -1
- src/mtuoc_aina_translator.py +4 -5
- src/salamandraTA7b_translator.py +22 -0
- src/translate_any_doc.py +1 -1
readme.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
# document_translator
|
2 |
|
3 |
-
Project to translate files
|
4 |
|
5 |
## Requirements
|
6 |
### python 3.12
|
@@ -16,3 +16,12 @@ I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from ht
|
|
16 |
### python requirements
|
17 |
|
18 |
pip install -r requirements.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# document_translator
|
2 |
|
3 |
+
Project to translate files using BSC's models while keeping the formatting and style of the original file.
|
4 |
|
5 |
## Requirements
|
6 |
### python 3.12
|
|
|
16 |
### python requirements
|
17 |
|
18 |
pip install -r requirements.txt
|
19 |
+
|
20 |
+
### mtuoc_aina_translator
|
21 |
+
|
22 |
+
To use this class, you also need a running instance of MTUOC's translation server loaded with the appropriate translation models. There's also no
|
23 |
+
need to run fastalign on the server side, since this project already runs it itself.
|
24 |
+
|
25 |
+
### salamandraTA7b_translator
|
26 |
+
|
27 |
+
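Class that translates through the SalamandraTA-7B demo hosted on Hugging Face (`BSC-LT/SalamandraTA-7B-Demo`). Set your Hugging Face token in `HF_TOKEN` in `src/salamandraTA7b_translator.py` before using it.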
|
src/mtuoc_aina_translator.py
CHANGED
@@ -1,20 +1,19 @@
|
|
1 |
import requests
|
2 |
import json
|
3 |
-
|
4 |
|
5 |
class MTUOCAinaTranslator:
|
6 |
def __init__(self, ip: str, port: str):
|
7 |
self.ip = ip
|
8 |
self.port = port
|
9 |
|
10 |
-
def translate(self, text):
|
11 |
-
stuff = sent_tokenize(text)
|
12 |
-
|
13 |
myobj = {
|
14 |
'id': '1',
|
15 |
'src': text,
|
16 |
}
|
17 |
-
url = 'http://
|
|
|
18 |
x = requests.post(url, json=myobj)
|
19 |
json_response = json.loads(x.text)
|
20 |
return json_response['tgt']
|
|
|
1 |
import requests
|
2 |
import json
|
3 |
+
|
4 |
|
5 |
class MTUOCAinaTranslator:
    """Client for an MTUOC translation server reached over HTTP.

    Each call to :meth:`translate` POSTs the text to the server's
    ``/translate`` endpoint and returns the ``tgt`` field of the JSON reply.
    """

    def __init__(self, ip: str, port: str, timeout: float | None = None):
        """Remember where the MTUOC server lives.

        Args:
            ip: Host name or IP address of the server.
            port: Port the server listens on.
            timeout: Optional number of seconds to wait for the server.
                ``None`` (the default) waits indefinitely, which is what
                happened before this parameter existed.
        """
        self.ip = ip
        self.port = port
        self.timeout = timeout

    def translate(self, text, source_lang=None, target_lang=None):
        """Translate ``text`` with the MTUOC server and return the result.

        Args:
            text: Text to translate.
            source_lang: Accepted so every translator class can be called the
                same way; it is not sent to the server.
            target_lang: Accepted for the same reason; not sent to the server.

        Returns:
            The value of the ``tgt`` key in the server's JSON response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the JSON response has no ``tgt`` key.
        """
        url = f'http://{self.ip}:{self.port}/translate'
        payload = {
            'id': '1',
            'src': text,
        }
        response = requests.post(url, json=payload, timeout=self.timeout)
        # Fail with the real HTTP error instead of an unrelated JSON/KeyError
        # raised while parsing an error page.
        response.raise_for_status()
        return response.json()['tgt']
|
src/salamandraTA7b_translator.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gradio_client import Client
|
2 |
+
from iso639 import languages
|
3 |
+
|
4 |
+
HF_TOKEN = "YOUR-HF-TOKEN-HERE"
|
5 |
+
|
6 |
+
class SalamandraTA7bTranslator:
    """Translator that calls the ``BSC-LT/SalamandraTA-7B-Demo`` Gradio app."""

    def __init__(self):
        """Create the Gradio client for the demo, passing ``HF_TOKEN``."""
        self.client = Client("BSC-LT/SalamandraTA-7B-Demo", hf_token=HF_TOKEN)

    def translate(self, text, source_lang, target_lang):
        """Translate ``text`` through the demo's ``/generate_output`` endpoint.

        Args:
            text: Text to translate.
            source_lang: Language code of ``text``, looked up through the
                ``alpha2`` key of ``iso639.languages``.
            target_lang: Language code of the desired output, looked up the
                same way.

        Returns:
            The first element of the value returned by the demo.
        """
        # Callers pass language codes, while the demo is given language
        # names, so resolve both codes (source first, then target).
        source_name, target_name = [
            languages.get(alpha2=code).name
            for code in (source_lang, target_lang)
        ]
        outputs = self.client.predict(
            task="Translation",
            source=source_name,
            target=target_name,
            input_text=text,
            mt_text=None,
            api_name="/generate_output",
        )
        return outputs[0]
|
src/translate_any_doc.py
CHANGED
@@ -271,7 +271,7 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
|
|
271 |
translated_paragraphs = []
|
272 |
for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
|
273 |
paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
|
274 |
-
translated_paragraphs.append(translator.translate(paragraph_text))
|
275 |
|
276 |
# time to align the translation with the original
|
277 |
print("Generating alignments...")
|
|
|
271 |
translated_paragraphs = []
|
272 |
for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
|
273 |
paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
|
274 |
+
translated_paragraphs.append(translator.translate(paragraph_text, source_lang, target_lang))
|
275 |
|
276 |
# time to align the translation with the original
|
277 |
print("Generating alignments...")
|