The M2M model is trained on ~100 languages and can translate between them, e.g.:
from transformers import pipeline
m2m100 = pipeline('translation', 'facebook/m2m100_418M', src_lang='en', tgt_lang="de")
m2m100(["hello world", "foo bar"])
[out]:
[{'translation_text': 'Hallo Welt'}, {'translation_text': 'Die Fu Bar'}]
But to translate into multiple target languages, users have to initialize multiple pipelines:
from transformers import pipeline
m2m100_en_de = pipeline('translation', 'facebook/m2m100_418M', src_lang='en', tgt_lang="de")
m2m100_en_fr = pipeline('translation', 'facebook/m2m100_418M', src_lang='en', tgt_lang="fr")
print(m2m100_en_de(["hello world", "foo bar"]))
print(m2m100_en_fr(["hello world", "foo bar"]))
[out]:
[{'translation_text': 'Hallo Welt'}, {'translation_text': 'Die Fu Bar'}]
[{'translation_text': 'Bonjour Monde'}, {'translation_text': 'Le bar Fou'}]
Is there a way to use a single pipeline for multiple target languages and/or source languages for the M2M model?
I've tried this:
from transformers import pipeline
m2m100_en_defr = pipeline('translation', 'facebook/m2m100_418M', src_lang='en', tgt_lang=["de", "fr"])
print(m2m100_en_defr(["hello world", "foo bar"]))
But it throws the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_28/3374873260.py in <module>
3 m2m100_en_defr = pipeline('translation', 'facebook/m2m100_418M', src_lang='en', tgt_lang=["de", "fr"])
4
----> 5 print(m2m100_en_defr(["hello world", "foo bar"]))
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/text2text_generation.py in __call__(self, *args, **kwargs)
364 token ids of the translation.
365 """
--> 366 return super().__call__(*args, **kwargs)
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/text2text_generation.py in __call__(self, *args, **kwargs)
163 """
164
--> 165 result = super().__call__(*args, **kwargs)
166 if (
167 isinstance(args[0], list)
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/base.py in __call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1088 inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
1089 )
-> 1090 outputs = list(final_iterator)
1091 return outputs
1092 else:
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/pt_utils.py in __next__(self)
122
123 # We're out of items within a batch
--> 124 item = next(self.iterator)
125 processed = self.infer(item, **self.params)
126 # We now have a batch of "inferred things".
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/pt_utils.py in __next__(self)
122
123 # We're out of items within a batch
--> 124 item = next(self.iterator)
125 processed = self.infer(item, **self.params)
126 # We now have a batch of "inferred things".
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
56 data = self.dataset.__getitems__(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/pt_utils.py in __getitem__(self, i)
17 def __getitem__(self, i):
18 item = self.dataset[i]
---> 19 processed = self.process(item, **self.params)
20 return processed
21
/opt/conda/lib/python3.7/site-packages/transformers/pipelines/text2text_generation.py in preprocess(self, truncation, src_lang, tgt_lang, *args)
313 if getattr(self.tokenizer, "_build_translation_inputs", None):
314 return self.tokenizer._build_translation_inputs(
--> 315 *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang
316 )
317 else:
/opt/conda/lib/python3.7/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py in _build_translation_inputs(self, raw_inputs, src_lang, tgt_lang, **extra_kwargs)
351 self.src_lang = src_lang
352 inputs = self(raw_inputs, add_special_tokens=True, **extra_kwargs)
--> 353 tgt_lang_id = self.get_lang_id(tgt_lang)
354 inputs["forced_bos_token_id"] = tgt_lang_id
355 return inputs
/opt/conda/lib/python3.7/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py in get_lang_id(self, lang)
379
380 def get_lang_id(self, lang: str) -> int:
--> 381 lang_token = self.get_lang_token(lang)
382 return self.lang_token_to_id[lang_token]
383
/opt/conda/lib/python3.7/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py in get_lang_token(self, lang)
376
377 def get_lang_token(self, lang: str) -> str:
--> 378 return self.lang_code_to_token[lang]
379
380 def get_lang_id(self, lang: str) -> int:
TypeError: unhashable type: 'list'
One would have expected the output to look something like this instead:
{"de": [{'translation_text': 'Hallo Welt'}, {'translation_text': 'Die Fu Bar'}]
"fr": [{'translation_text': 'Bonjour Monde'}, {'translation_text': 'Le Foo Bar'}]
}
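An answer that reuses a single pipeline and simply overrides the target language per call would also work for me. Something like the sketch below, assuming the translation pipeline accepts tgt_lang as a call-time keyword argument (I haven't verified this across transformers versions):

from transformers import pipeline

# One pipeline, one copy of the model; "de" is only the default target language.
m2m100 = pipeline('translation', 'facebook/m2m100_418M', src_lang='en', tgt_lang='de')

sentences = ["hello world", "foo bar"]
# Override tgt_lang per call, assuming the pipeline forwards it to the tokenizer.
results = {lang: m2m100(sentences, tgt_lang=lang) for lang in ["de", "fr"]}
print(results)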
If we use multiple pipelines, is the model memory-mapped and shared between them? Will it initialize multiple models with multiple tokenizer pairs, or a single model with multiple tokenizers?
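For context, the explicit sharing I would otherwise fall back to looks like the sketch below: load the model and tokenizer once and pass the same objects to each pipeline, so only one copy of the weights is held in memory (assuming pipeline() accepts preloaded model/tokenizer objects together with src_lang/tgt_lang):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load the checkpoint and tokenizer once...
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/m2m100_418M')
tokenizer = AutoTokenizer.from_pretrained('facebook/m2m100_418M')

# ...and hand the same objects to both pipelines, so they share one model instance
# instead of loading the 418M-parameter checkpoint twice.
m2m100_en_de = pipeline('translation', model=model, tokenizer=tokenizer, src_lang='en', tgt_lang='de')
m2m100_en_fr = pipeline('translation', model=model, tokenizer=tokenizer, src_lang='en', tgt_lang='fr')

print(m2m100_en_de(["hello world", "foo bar"]))
print(m2m100_en_fr(["hello world", "foo bar"]))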
from How to use pipeline for multiple target language translations with M2M model in Huggingface?