publications.bib

% ICLR paper. The citation key carries a 2020 prefix while the year field is
% 2022; the key is left unchanged so existing citations keep resolving.
@inproceedings{dalvi2020discovering,
  author    = {Dalvi, Fahim and Khan, Abdul and Alam, Firoj and Durrani, Nadir and Xu, Jia and Sajjad, Hassan},
  title     = {Discovering Latent Concepts Learned in {BERT}},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
  url       = {https://openreview.net/forum?id=POTMtpYI1xH}
}
% ACL 2017 paper on what NMT models learn about morphology.
% The title is not wrapped in an extra brace pair, so the bibliography style
% decides the casing; the month uses the predefined macro.
@inproceedings{belinkov:2017:acl,
  author    = {Belinkov, Yonatan and Durrani, Nadir and Dalvi, Fahim and Sajjad, Hassan and Glass, James},
  title     = {What do Neural Machine Translation Models Learn about Morphology?},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL)},
  month     = jul,
  year      = {2017},
  address   = {Vancouver},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/P17-1080},
  url       = {https://aclanthology.org/P17-1080}
}
% IJCNLP 2017 long paper (pages 1--10) on layer-wise representations in NMT.
@inproceedings{belinkov-etal-2017-evaluating,
  author    = {Belinkov, Yonatan and
               M{\`a}rquez, Llu{\'\i}s and
               Sajjad, Hassan and
               Durrani, Nadir and
               Dalvi, Fahim and
               Glass, James},
  title     = {Evaluating Layers of Representation in Neural Machine Translation on Part-of-Speech and Semantic Tagging Tasks},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month     = nov,
  year      = {2017},
  address   = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  pages     = {1--10},
  url       = {https://aclanthology.org/I17-1001}
}
% IJCNLP 2017 long paper on morphological learning in the NMT decoder.
@inproceedings{dalvi-etal-2017-understanding,
  author    = {Dalvi, Fahim and
               Durrani, Nadir and
               Sajjad, Hassan and
               Belinkov, Yonatan and
               Vogel, Stephan},
  title     = {Understanding and Improving Morphological Learning in the Neural Machine Translation Decoder},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  publisher = {Asian Federation of Natural Language Processing},
  address   = {Taipei, Taiwan},
  year      = {2017},
  month     = nov,
  pages     = {142--151},
  url       = {https://aclanthology.org/I17-1015},
  abstract  = {End-to-end training makes the neural machine translation (NMT) architecture simpler, yet elegant compared to traditional statistical machine translation (SMT). However, little is known about linguistic patterns of morphology, syntax and semantics learned during the training of NMT systems, and more importantly, which parts of the architecture are responsible for learning each of these phenomenon. In this paper we i) analyze how much morphology an NMT decoder learns, and ii) investigate whether injecting target morphology in the decoder helps it to produce better translations. To this end we present three methods: i) simultaneous translation, ii) joint-data learning, and iii) multi-task learning. Our results show that explicit morphological information helps the decoder learn target language morphology and improves the translation quality by 0.2{--}0.6 BLEU points.}
}
% ICLR paper. The citation key carries a 2018 prefix while the year field is
% 2019; the key is kept as-is.
@inproceedings{bau2018identifying,
  author    = {Anthony Bau and Yonatan Belinkov and Hassan Sajjad and Nadir Durrani and Fahim Dalvi and James Glass},
  title     = {Identifying and Controlling Important Neurons in Neural Machine Translation},
  year      = {2019},
  booktitle = {International Conference on Learning Representations},
  url       = {https://openreview.net/forum?id=H1z-PsR5KX}
}
% AAAI 2019 paper on individual neurons in deep NLP models.
% Presentation status is recorded in the note field, not the proceedings title.
@inproceedings{dalvi:2019:AAAI,
  author    = {Dalvi, Fahim and Durrani, Nadir and Sajjad, Hassan and Belinkov, Yonatan and Bau, D. Anthony and Glass, James},
  title     = {What Is One Grain of Sand in the Desert? Analyzing Individual Neurons in Deep {NLP} Models},
  booktitle = {Proceedings of the Thirty-Third {AAAI} Conference on Artificial Intelligence ({AAAI})},
  month     = jan,
  year      = {2019},
  url       = {https://ojs.aaai.org/index.php/AAAI/article/view/4592/4470},
  note      = {Oral presentation}
}
% AAAI 2019 entry for the NeuroX toolkit.
% The venue city is stored in the address field, as in the other entries of
% this file; the toolkit name is brace-protected against style recasing.
@inproceedings{neurox-aaai19:demo,
  author    = {Dalvi, Fahim and Nortonsmith, Avery and Bau, D. Anthony and Belinkov, Yonatan and Sajjad, Hassan and Durrani, Nadir and Glass, James},
  title     = {{NeuroX}: A Toolkit for Analyzing Individual Neurons in Neural Networks},
  booktitle = {{AAAI} Conference on Artificial Intelligence ({AAAI})},
  month     = jan,
  year      = {2019},
  address   = {Honolulu, USA},
  url       = {https://www.aaai.org/ojs/index.php/AAAI/article/view/5063}
}
% NAACL-HLT 2019 paper comparing NMT representations of different granularities.
@inproceedings{durrani-etal-2019-one,
  author    = {Durrani, Nadir and
               Dalvi, Fahim and
               Sajjad, Hassan and
               Belinkov, Yonatan and
               Nakov, Preslav},
  title     = {One Size Does Not Fit All: Comparing {NMT} Representations of Different Granularities},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month     = jun,
  year      = {2019},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {1504--1516},
  doi       = {10.18653/v1/N19-1154},
  url       = {https://aclanthology.org/N19-1154}
}
% EMNLP 2020 paper: neuron-level analysis of pre-trained language models.
@inproceedings{durrani-etal-2020-analyzing,
  author    = {Durrani, Nadir and
               Sajjad, Hassan and
               Dalvi, Fahim and
               Belinkov, Yonatan},
  title     = {Analyzing Individual Neurons in Pre-trained Language Models},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {4865--4880},
  doi       = {10.18653/v1/2020.emnlp-main.395},
  url       = {https://aclanthology.org/2020.emnlp-main.395},
  abstract  = {While a lot of analysis has been carried to demonstrate linguistic knowledge captured by the representations learned within deep NLP models, very little attention has been paid towards individual neurons. We carry out a neuron-level analysis using core linguistic tasks of predicting morphology, syntax and semantics, on pre-trained language models, with questions like: i) do individual neurons in pre-trained models capture linguistic information? ii) which parts of the network learn more about certain linguistic phenomena? iii) how distributed or focused is the information? and iv) how do various architectures differ in learning these properties? We found small subsets of neurons to predict linguistic tasks, with lower level tasks (such as morphology) localized in fewer neurons, compared to higher level task of predicting syntax. Our study also reveals interesting cross architectural comparisons. For example, we found neurons in XLNet to be more localized and disjoint when predicting properties compared to BERT and others, where they are more distributed and coupled.}
}
% EMNLP 2020 paper analysing redundancy in pretrained transformer models.
@inproceedings{dalvi-etal-2020-analyzing,
  author    = {Dalvi, Fahim and
               Sajjad, Hassan and
               Durrani, Nadir and
               Belinkov, Yonatan},
  title     = {Analyzing Redundancy in Pretrained Transformer Models},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2020},
  month     = nov,
  pages     = {4908--4926},
  doi       = {10.18653/v1/2020.emnlp-main.398},
  url       = {https://aclanthology.org/2020.emnlp-main.398},
  abstract  = {Transformer-based deep NLP models are trained using hundreds of millions of parameters, limiting their applicability in computationally constrained environments. In this paper, we study the cause of these limitations by defining a notion of Redundancy, which we categorize into two classes: General Redundancy and Task-specific Redundancy. We dissect two popular pretrained models, BERT and XLNet, studying how much redundancy they exhibit at a representation-level and at a more fine-grained neuron-level. Our analysis reveals interesting insights, such as i) 85{\%} of the neurons across the network are redundant and ii) at least 92{\%} of them can be removed when optimizing towards a downstream task. Based on our analysis, we present an efficient feature-based transfer learning procedure, which maintains 97{\%} performance while using at-most 10{\%} of the original neurons.}
}
% Computational Linguistics 46(1), 2020: journal article on the linguistic
% representational power of NMT models.
@article{belinkov-etal-2020-linguistic,
  author   = {Belinkov, Yonatan and
              Durrani, Nadir and
              Dalvi, Fahim and
              Sajjad, Hassan and
              Glass, James},
  title    = {On the Linguistic Representational Power of Neural Machine Translation Models},
  journal  = {Computational Linguistics},
  volume   = {46},
  number   = {1},
  month    = mar,
  year     = {2020},
  pages    = {1--52},
  doi      = {10.1162/coli_a_00367},
  url      = {https://aclanthology.org/2020.cl-1.1},
  abstract = {Despite the recent success of deep neural networks in natural language processing and other spheres of artificial intelligence, their interpretability remains a challenge. We analyze the representations learned by neural machine translation (NMT) models at various levels of granularity and evaluate their quality through relevant extrinsic properties. In particular, we seek answers to the following questions: (i) How accurately is word structure captured within the learned representations, which is an important aspect in translating morphologically rich languages? (ii) Do the representations capture long-range dependencies, and effectively handle syntactically divergent languages? (iii) Do the representations capture lexical semantics? We conduct a thorough investigation along several parameters: (i) Which layers in the architecture capture each of these linguistic phenomena; (ii) How does the choice of translation unit (word, character, or subword unit) impact the linguistic properties captured by the underlying representations? (iii) Do the encoder and decoder learn differently and independently? (iv) Do the representations learned by multilingual NMT models capture the same amount of linguistic information as their bilingual counterparts? Our data-driven, quantitative evaluation illuminates important aspects in NMT models and their ability to capture various linguistic phenomena. We show that deep NMT models trained in an end-to-end fashion, without being provided any direct supervision during the training process, learn a non-trivial amount of linguistic information. Notable findings include the following observations: (i) Word morphology and part-of-speech information are captured at the lower layers of the model; (ii) In contrast, lexical semantics or non-local syntactic and semantic dependencies are better represented at the higher layers of the model; (iii) Representations learned using characters are more informed about word-morphology compared to those learned using subword units; and (iv) Representations learned by multilingual models are richer compared to bilingual models.}
}
% ACL 2020 paper: similarity analysis of contextual word representation models.
@inproceedings{wu-etal-2020-similarity,
  author    = {Wu, John and
               Belinkov, Yonatan and
               Sajjad, Hassan and
               Durrani, Nadir and
               Dalvi, Fahim and
               Glass, James},
  title     = {Similarity Analysis of Contextual Word Representation Models},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2020},
  month     = jul,
  pages     = {4638--4655},
  doi       = {10.18653/v1/2020.acl-main.422},
  url       = {https://aclanthology.org/2020.acl-main.422},
  abstract  = {This paper investigates contextual word representation models from the lens of similarity analysis. Given a collection of trained models, we measure the similarity of their internal representations and attention. Critically, these models come from vastly different architectures. We use existing and novel similarity measures that aim to gauge the level of localization of information in the deep models, and facilitate the investigation of which design factors affect model similarity, without requiring any external linguistic annotation. The analysis reveals that models within the same family are more similar to one another, as may be expected. Surprisingly, different architectures have rather similar representations, but different individual neurons. We also observed differences in information localization in lower and higher layers and found that higher layers are more affected by fine-tuning on downstream tasks.}
}
% NAACL 2021 tutorial on fine-grained interpretation and causation analysis.
@inproceedings{sajjad-etal-2021-fine,
  author    = {Sajjad, Hassan and
               Kokhlikyan, Narine and
               Dalvi, Fahim and
               Durrani, Nadir},
  title     = {Fine-grained Interpretation and Causation Analysis in Deep {NLP} Models},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Tutorials},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2021},
  month     = jun,
  pages     = {5--10},
  doi       = {10.18653/v1/2021.naacl-tutorials.2},
  url       = {https://aclanthology.org/2021.naacl-tutorials.2},
  abstract  = {Deep neural networks have constantly pushed the state-of-the-art performance in natural language processing and are considered as the de-facto modeling approach in solving complex NLP tasks such as machine translation, summarization and question-answering. Despite the proven efficacy of deep neural networks at-large, their opaqueness is a major cause of concern. In this tutorial, we will present research work on interpreting fine-grained components of a neural network model from two perspectives, i) fine-grained interpretation, and ii) causation analysis. The former is a class of methods to analyze neurons with respect to a desired language concept or a task. The latter studies the role of neurons and input features in explaining the decisions made by the model. We will also discuss how interpretation methods and causation analysis can connect towards better interpretability of model prediction. Finally, we will walk you through various toolkits that facilitate fine-grained interpretation and causation analysis of neural models.}
}
% Findings of ACL-IJCNLP 2021 paper on transfer learning and linguistic knowledge.
@inproceedings{durrani-etal-2021-transfer,
  author    = {Durrani, Nadir and
               Sajjad, Hassan and
               Dalvi, Fahim},
  title     = {How transfer learning impacts linguistic knowledge in deep {NLP} models?},
  booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2021},
  month     = aug,
  pages     = {4947--4957},
  doi       = {10.18653/v1/2021.findings-acl.438},
  url       = {https://aclanthology.org/2021.findings-acl.438}
}
% arXiv preprint 2004.03844 (DBLP export, listed under CoRR).
% Only the acronym is brace-protected in the title: the colon stays outside
% the braces so sentence-casing styles keep the subtitle's capital letter.
@article{sajjad2020poorBERT,
  author     = {Sajjad, Hassan and
                Dalvi, Fahim and
                Durrani, Nadir and
                Nakov, Preslav},
  title      = {Poor Man's {BERT}: Smaller and Faster Transformer Models},
  journal    = {CoRR},
  volume     = {abs/2004.03844},
  year       = {2020},
  url        = {https://arxiv.org/abs/2004.03844},
  eprinttype = {arXiv},
  eprint     = {2004.03844},
  timestamp  = {Tue, 14 Apr 2020 16:40:34 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2004-03844.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
% arXiv preprint 2108.13138 (survey). The url is derived from the eprint id so
% renderers that ignore the eprint fields still produce a link.
@misc{sajjad2021neuronlevel,
  author        = {Sajjad, Hassan and Durrani, Nadir and Dalvi, Fahim},
  title         = {Neuron-level Interpretation of Deep {NLP} Models: A Survey},
  year          = {2021},
  eprint        = {2108.13138},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2108.13138}
}
% arXiv preprint 2104.07456. The url is derived from the eprint id so
% renderers that ignore the eprint fields still produce a link.
@misc{sajjad2021effect,
  author        = {Sajjad, Hassan and Alam, Firoj and Dalvi, Fahim and Durrani, Nadir},
  title         = {Effect of Post-processing on Contextualized Word Representations},
  year          = {2021},
  eprint        = {2104.07456},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2104.07456}
}
% arXiv preprint 2107.00439. The url is derived from the eprint id so
% renderers that ignore the eprint fields still produce a link.
@misc{chowdhury2021endtoend,
  author        = {Chowdhury, Shammur Absar and Durrani, Nadir and Ali, Ahmed},
  title         = {What do End-to-End Speech Models Learn about Speaker, Language and Channel Information? A Layer-wise and Neuron-level Analysis},
  year          = {2021},
  eprint        = {2107.00439},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2107.00439}
}

This file was generated by bibtex2html 1.99.