publications
2023
- ACL — Glot500: Scaling Multilingual Corpora and Language Models to 500 Languages. Ayyoob ImaniGooghari, Peiqin Lin, Amir Hossein Kargaran, Silvia Severini, Masoud Jalili Sabet, and 6 more authors. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Jul 2023
@inproceedings{imanigooghari-etal-2023-glot500,
  title     = {{Glot500}: Scaling Multilingual Corpora and Language Models to 500 Languages},
  author    = {ImaniGooghari, Ayyoob and Lin, Peiqin and Kargaran, Amir Hossein and Severini, Silvia and Jalili Sabet, Masoud and Kassner, Nora and Ma, Chunlan and Schmid, Helmut and Martins, Andr{\'e} and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.acl-long.61},
  bib       = {https://aclanthology.org/2023.acl-long.61.bib},
  doi       = {10.18653/v1/2023.acl-long.61},
}
2022
- EMNLP — Graph-Based Multilingual Label Propagation for Low-Resource Part-of-Speech Tagging. Ayyoob ImaniGooghari, Silvia Severini, Masoud Jalili Sabet, François Yvon, and Hinrich Schütze. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Dec 2022
Part-of-Speech (POS) tagging is an important component of the NLP pipeline, but many low-resource languages lack labeled data for training. An established method for training a POS tagger in such a scenario is to create a labeled training set by transferring from high-resource languages. In this paper, we propose a novel method for transferring labels from multiple high-resource source to low-resource target languages. We formalize POS tag projection as graph-based label propagation. Given translations of a sentence in multiple languages, we create a graph with words as nodes and alignment links as edges by aligning words for all language pairs. We then propagate node labels from source to target using a Graph Neural Network augmented with transformer layers. We show that our propagation creates training sets that allow us to train POS taggers for a diverse set of languages. When combined with enhanced contextualized embeddings, our method achieves a new state-of-the-art for unsupervised POS tagging of low-resource languages.
@inproceedings{imanigooghari-etal-2022-graph,
  title     = {Graph-Based Multilingual Label Propagation for Low-Resource Part-of-Speech Tagging},
  author    = {ImaniGooghari, Ayyoob and Severini, Silvia and Jalili Sabet, Masoud and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, United Arab Emirates},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.emnlp-main.102},
  bib       = {https://aclanthology.org/2022.emnlp-main.102.bib},
  doi       = {10.18653/v1/2022.emnlp-main.102},
  pages     = {1577--1589},
}
- ACL — Graph Neural Networks for Multiparallel Word Alignment. Ayyoob Imani, Lütfi Kerem Senel, Masoud Jalili Sabet, François Yvon, and Hinrich Schuetze. In Findings of the Association for Computational Linguistics: ACL 2022, May 2022
After a period of decrease, interest in word alignments is increasing again for their usefulness in domains such as typological research, cross-lingual annotation projection and machine translation. Generally, alignment algorithms only use bitext and do not make use of the fact that many parallel corpora are multiparallel. Here, we compute high-quality word alignments between multiple language pairs by considering all language pairs together. First, we create a multiparallel word alignment graph, joining all bilingual word alignment pairs in one graph. Next, we use graph neural networks (GNNs) to exploit the graph structure. Our GNN approach (i) utilizes information about the meaning, position and language of the input words, (ii) incorporates information from multiple parallel sentences, (iii) adds and removes edges from the initial alignments, and (iv) yields a prediction model that can generalize beyond the training sentences. We show that community detection algorithms can provide valuable information for multiparallel word alignment. Our method outperforms previous work on three word alignment datasets and on a downstream task.
@inproceedings{imani-etal-2022-graph,
  title     = {Graph Neural Networks for Multiparallel Word Alignment},
  author    = {Imani, Ayyoob and Senel, L{\"u}tfi Kerem and Jalili Sabet, Masoud and Yvon, Fran{\c{c}}ois and Schuetze, Hinrich},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.findings-acl.108},
  bib       = {https://aclanthology.org/2022.findings-acl.108.bib},
  doi       = {10.18653/v1/2022.findings-acl.108},
  pages     = {1384--1396},
}
- ACL — CaMEL: Case Marker Extraction without Labels. Leonie Weissweiler, Valentin Hofmann, Masoud Jalili Sabet, and Hinrich Schuetze. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), May 2022
We introduce CaMEL (Case Marker Extraction without Labels), a novel and challenging task in computational morphology that is especially relevant for low-resource languages. We propose a first model for CaMEL that uses a massively multilingual corpus to extract case markers in 83 languages based only on a noun phrase chunker and an alignment system. To evaluate CaMEL, we automatically construct a silver standard from UniMorph. The case markers extracted by our model can be used to detect and visualise similarities and differences between the case systems of different languages as well as to annotate fine-grained deep cases in languages in which they are not overtly marked.
@inproceedings{weissweiler-etal-2022-camel,
  title     = {{CaMEL}: {C}ase {M}arker {E}xtraction without {L}abels},
  author    = {Weissweiler, Leonie and Hofmann, Valentin and Jalili Sabet, Masoud and Schuetze, Hinrich},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-long.377},
  bib       = {https://aclanthology.org/2022.acl-long.377.bib},
  doi       = {10.18653/v1/2022.acl-long.377},
  pages     = {5506--5516},
}
2021
- EMNLP — Graph Algorithms for Multiparallel Word Alignment. Ayyoob Imani*, Masoud Jalili Sabet*, Lutfi Kerem Senel, Philipp Dufter, François Yvon, and 1 more author. In EMNLP, 2021
With the advent of end-to-end deep learning approaches in machine translation, interest in word alignments initially decreased; however, they have again become a focus of research more recently. Alignments are useful for typological research, transferring formatting like markup to translated texts, and can be used in the decoding of machine translation systems. At the same time, massively multilingual processing is becoming an important NLP scenario, and pretrained language and machine translation models that are truly multilingual are proposed. However, most alignment algorithms rely on bitexts only and do not leverage the fact that many parallel corpora are multiparallel. In this work, we exploit the multiparallelity of corpora by representing an initial set of bilingual alignments as a graph and then predicting additional edges in the graph. We present two graph algorithms for edge prediction: one inspired by recommender systems and one based on network link prediction. Our experimental results show absolute improvements in F1 of up to 28% over the baseline bilingual word aligner in different datasets.
@inproceedings{imanigooghari-etal-2021-graph,
  title     = {Graph Algorithms for Multiparallel Word Alignment},
  author    = {Imani*, Ayyoob and Jalili Sabet*, Masoud and Senel, Lutfi Kerem and Dufter, Philipp and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2021},
  address   = {Online and Punta Cana, Dominican Republic},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.emnlp-main.665},
  bib       = {https://aclanthology.org/2021.emnlp-main.665.bib},
  doi       = {10.18653/v1/2021.emnlp-main.665},
  pages     = {8457--8469},
}
- ACL — ParCourE: A Parallel Corpus Explorer for a Massively Multilingual Corpus. Ayyoob Imani, Masoud Jalili Sabet, Philipp Dufter, Michael Cysouw, and Hinrich Schütze. In ACL Demo, 2021
With more than 7000 languages worldwide, multilingual natural language processing (NLP) is essential both from an academic and commercial perspective. Researching typological properties of languages is fundamental for progress in multilingual NLP. Examples include assessing language similarity for effective transfer learning, injecting inductive biases into machine learning models or creating resources such as dictionaries and inflection tables. We provide ParCourE, an online tool that allows to browse a word-aligned parallel corpus, covering 1334 languages. We give evidence that this is useful for typological research. ParCourE can be set up for any parallel corpus and can thus be used for typological research on other corpora as well as for exploring their quality and properties.
@inproceedings{imanigooghari-etal-2021-parcoure,
  title         = {{P}ar{C}our{E}: A Parallel Corpus Explorer for a Massively Multilingual Corpus},
  author        = {Imani, Ayyoob and Jalili Sabet, Masoud and Dufter, Philipp and Cysouw, Michael and Sch{\"u}tze, Hinrich},
  booktitle     = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations},
  month         = aug,
  year          = {2021},
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2021.acl-demo.8},
  bib           = {https://aclanthology.org/2021.acl-demo.8.bib},
  doi           = {10.18653/v1/2021.acl-demo.8},
  pages         = {63--72},
  internal-note = {surname corrected from "Cysou" to "Cysouw" per the ACL Anthology record -- verify},
}
2020
- EMNLP — SimAlign: High Quality Word Alignments Without Parallel Training Data Using Static and Contextualized Embeddings. Masoud Jalili Sabet*, Philipp Dufter*, François Yvon, and Hinrich Schütze. In EMNLP Findings, 2020
Word alignments are useful for tasks like statistical and neural machine translation (NMT) and cross-lingual annotation projection. Statistical word aligners perform well, as do methods that extract alignments jointly with translations in NMT. However, most approaches require parallel training data and quality decreases as less training data is available. We propose word alignment methods that require no parallel data. The key idea is to leverage multilingual word embeddings – both static and contextualized – for word alignment. Our multilingual embeddings are created from monolingual data only without relying on any parallel data or dictionaries. We find that alignments created from embeddings are superior for four and comparable for two language pairs compared to those produced by traditional statistical aligners – even with abundant parallel data; e.g., contextualized embeddings achieve a word alignment F1 for English-German that is 5 percentage points higher than eflomal, a high-quality statistical aligner, trained on 100k parallel sentences.
@inproceedings{jalili-sabet-etal-2020-simalign,
  title     = {{S}im{A}lign: High Quality Word Alignments Without Parallel Training Data Using Static and Contextualized Embeddings},
  author    = {Jalili Sabet*, Masoud and Dufter*, Philipp and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.findings-emnlp.147},
  bib       = {https://aclanthology.org/2020.findings-emnlp.147.bib},
  doi       = {10.18653/v1/2020.findings-emnlp.147},
  pages     = {1627--1643},
}
- arXiv — Subword Sampling for Low Resource Word Alignment. Ehsaneddin Asgari*, Masoud Jalili Sabet*, Philipp Dufter, Christopher Ringlstetter, and Hinrich Schütze. arXiv, 2020
@article{asgari-etal-2020-sampling,
  title         = {Subword Sampling for Low Resource Word Alignment},
  author        = {Asgari*, Ehsaneddin and Jalili Sabet*, Masoud and Dufter, Philipp and Ringlstetter, Christopher and Sch{\"u}tze, Hinrich},
  journal       = {arXiv},
  year          = {2020},
  eprint        = {2012.11657},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  doi           = {10.48550/ARXIV.2012.11657},
  url           = {https://arxiv.org/abs/2012.11657},
  keywords      = {Computation and Language (cs.CL), FOS: Computer and information sciences},
  copyright     = {Creative Commons Attribution Non Commercial No Derivatives 4.0 International},
}
2019
- ECIR — LICD: A Language-Independent Approach for Aspect Category Detection. Erfan Ghadery, Sajad Movahedi, Masoud Jalili Sabet, Heshaam Faili, and Azadeh Shakery. In ECIR, 2019
Aspect-based sentiment analysis (ABSA) deals with processing and summarizing customer reviews and has been a topic of interest in recent years. Given a set of predefined categories, Aspect Category Detection (ACD), as a subtask of ABSA, aims to assign a subset of these categories to a given review sentence. Thanks to the existence of websites such as Yelp and TripAdvisor, there exist a huge amount of reviews in several languages, and therefore the need for language-independent methods in this task seems necessary. In this paper, we propose Language-Independent Category Detector (LICD), a supervised method based on text matching without the need for any language-specific tools and hand-crafted features for identifying aspect categories. For a given sentence, our proposed method performs ACD based on two hypotheses: First, a category should be assigned to a sentence if there is a high semantic similarity between the sentence and a set of representative words of that category. Second, a category should be assigned to a sentence if sentences with high semantic and structural similarity to that sentence belong to that category. To apply the former hypothesis, we used soft cosine measure, and for the latter, word mover’s distance measure is utilized. Using these two measures, for a given sentence we calculate a set of similarity scores as features for a one-vs-all logistic regression classifier per category. Experimental results on the multilingual SemEval-2016 datasets in the restaurant domain demonstrate that our approach outperforms baseline methods in English, Russian, and Dutch languages, and obtains competitive results with the strong deep neural network-based baselines in French, Turkish, and Spanish languages.
@inproceedings{Ghadery-etal-2019-LICD,
  title     = {{LICD}: A Language-Independent Approach for Aspect Category Detection},
  author    = {Ghadery, Erfan and Movahedi, Sajad and Jalili Sabet, Masoud and Faili, Heshaam and Shakery, Azadeh},
  booktitle = {ECIR},
  year      = {2019},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {575--589},
}
2017
- MT Journal — Automatic translation memory cleaning. Matteo Negri, Duygu Ataman, Masoud Jalili Sabet, Marco Turchi, and Marcello Federico. Machine Translation, 2017
We address the problem of automatically cleaning a translation memory (TM) by identifying problematic translation units (TUs). In this context, we treat as “problematic TUs” those containing useless translations from the point of view of the user of a computer-assisted translation tool. We approach TM cleaning both as a supervised and as an unsupervised learning problem. In both cases, we take advantage of Translation Memory open-source purifier, an open-source TM cleaning tool also presented in this paper. The two learning paradigms are evaluated on different benchmarks extracted from MyMemory, the world’s largest public TM. Our results indicate the effectiveness of the supervised approach in the ideal condition in which labelled training data is available, and the viability of the unsupervised solution for challenging situations in which training data is not accessible.
@article{Negri2017,
  title       = {Automatic translation memory cleaning},
  author      = {Negri, Matteo and Ataman, Duygu and Jalili Sabet, Masoud and Turchi, Marco and Federico, Marcello},
  journal     = {Machine Translation},
  year        = {2017},
  pages       = {93--115},
  doi         = {10.1007/s10590-017-9191-5},
  language    = {English},
  keywords    = {Translation memories; Machine learning; Data cleaning},
  copyright   = {Springer Science+Business Media Dordrecht},
  affiliation = {Fondazione Bruno Kessler; Fondazione Bruno Kessler, Universit{\`a} degli Studi di Trento; School of Electrical and Computer Engineering, University of Tehran},
}
2016
- ACL — TMop: a Tool for Unsupervised Translation Memory Cleaning. Masoud Jalili Sabet, Matteo Negri, Marco Turchi, José G. C. de Souza, and Marcello Federico. In ACL Demo, 2016
@inproceedings{jalili-sabet-etal-2016-tmop,
  title     = {{TM}op: a Tool for Unsupervised Translation Memory Cleaning},
  author    = {Jalili Sabet, Masoud and Negri, Matteo and Turchi, Marco and C. de Souza, Jos{\'e} G. and Federico, Marcello},
  booktitle = {Proceedings of {ACL}-2016 System Demonstrations},
  month     = aug,
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/P16-4009},
  bib       = {https://aclanthology.org/P16-4009.bib},
  doi       = {10.18653/v1/P16-4009},
  pages     = {49--54},
}
- ACL — An Unsupervised Method for Automatic Translation Memory Cleaning. Masoud Jalili Sabet, Matteo Negri, Marco Turchi, and Eduard Barbu. In ACL, 2016
@inproceedings{jalili-sabet-etal-2016-unsupervised,
  title     = {An Unsupervised Method for Automatic Translation Memory Cleaning},
  author    = {Jalili Sabet, Masoud and Negri, Matteo and Turchi, Marco and Barbu, Eduard},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = aug,
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/P16-2047},
  bib       = {https://aclanthology.org/P16-2047.bib},
  doi       = {10.18653/v1/P16-2047},
  pages     = {287--292},
}
- Coling — Improving Word Alignment of Rare Words with Word Embeddings. Masoud Jalili Sabet, Heshaam Faili, and Gholamreza Haffari. In Coling, 2016
We address the problem of inducing word alignment for language pairs by developing an unsupervised model with the capability of getting applied to other generative alignment models. We approach the task by: i) proposing a new alignment model based on the IBM alignment model 1 that uses vector representation of words, and ii) examining the use of similar source words to overcome the problem of rare source words and improving the alignments. We apply our method to English-French corpora and run the experiments with different sizes of sentence pairs. Our results show competitive performance against the baseline and in some cases improve the results up to 6.9% in terms of precision.
@inproceedings{jalili-sabet-etal-2016-improving,
  title     = {Improving Word Alignment of Rare Words with Word Embeddings},
  author    = {Jalili Sabet, Masoud and Faili, Heshaam and Haffari, Gholamreza},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  url       = {https://aclanthology.org/C16-1302},
  bib       = {https://aclanthology.org/C16-1302.bib},
  pages     = {3209--3215},
}
- Coling — Learning to Weight Translations using Ordinal Linear Regression and Query-generated Training Data for Ad-hoc Retrieval with Long Queries. Javid Dadashkarimi, Masoud Jalili Sabet, and Azadeh Shakery. In Coling, 2016
Ordinal regression which is known with learning to rank has long been used in information retrieval (IR). Learning to rank algorithms, have been tailored in document ranking, information filtering, and building large aligned corpora successfully. In this paper, we propose to use this algorithm for query modeling in cross-language environments. To this end, first we build a query-generated training data using pseudo-relevant documents to the query and all translation candidates. The pseudo-relevant documents are obtained by top-ranked documents in response to a translation of the original query. The class of each candidate in the training data is determined based on presence/absence of the candidate in the pseudo-relevant documents. We learn an ordinal regression model to score the candidates based on their relevance to the context of the query, and after that, we construct a query-dependent translation model using a softmax function. Finally, we re-weight the query based on the obtained model. Experimental results on French, German, Spanish, and Italian CLEF collections demonstrate that the proposed method achieves better results compared to state-of-the-art cross-language information retrieval methods, particularly in long queries with large training data.
@inproceedings{dadashkarimi-etal-2016-learning,
  title     = {Learning to Weight Translations using Ordinal Linear Regression and Query-generated Training Data for Ad-hoc Retrieval with Long Queries},
  author    = {Dadashkarimi, Javid and Jalili Sabet, Masoud and Shakery, Azadeh},
  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  url       = {https://aclanthology.org/C16-1162},
  bib       = {https://aclanthology.org/C16-1162.bib},
  pages     = {1725--1733},
}