publications.bib

@inproceedings{boughorbel2025beyond,
  title = {Beyond the Leaderboard: Model Diffing for Understanding Performance Disparities in {LLM}s},
  author = {Sabri Boughorbel and Fahim Dalvi and Nadir Durrani and Majd Hawasly},
  booktitle = {The 2025 Conference on Empirical Methods in Natural Language Processing},
  year = {2025},
  url = {https://openreview.net/forum?id=BSNycQd27P}
}

@inproceedings{ersoy25_interspeech,
  title = {From Words to Waves: Analyzing Concept Formation in Speech and Text-Based Foundation Models},
  author = {Asim Ersoy and Basel Ahmad Mousi and Shammur Absar Chowdhury and Firoj Alam and Fahim I Dalvi and Nadir Durrani},
  booktitle = {Proceedings of the 26th edition of the Interspeech Conference},
  month = aug,
  year = {2025},
  address = {Rotterdam, Netherlands},
  publisher = {Interspeech 2025},
  doi = {10.21437/Interspeech.2025-2180},
  issn = {2958-1796},
  url = {https://www.isca-archive.org/interspeech_2025/ersoy25_interspeech.pdf},
  pages = {241--245},
  abstract = {The emergence of large language models has demonstrated that systems trained solely on text can acquire extensive world knowledge, develop reasoning capabilities, and internalize abstract semantic concepts - showcasing properties that can be associated with general intelligence. This raises an intriguing question: Do such concepts emerge in models trained on other modalities, such as speech? Furthermore, when models are trained jointly on multiple modalities: Do they develop a richer, more structured semantic understanding? To explore this, we analyze the conceptual structures learned by speech and textual models both individually and jointly. We employ Latent Concept Analysis, an unsupervised method for uncovering and interpreting latent representations in neural networks, to examine how semantic abstractions form across modalities. To support reproducibility, we have released our code along with a curated audio version of the SST-2 dataset for public access.},
  preview = {InterSpeech25.png},
  bibtex_show = {true},
  abbr = {INTERSPEECH'25},
  pdf = {https://www.isca-archive.org/interspeech_2025/ersoy25_interspeech.pdf},
  area = {Latent Concepts}
}

@inproceedings{yu-etal-2024-latent,
  author    = {Yu, Xuemin and Dalvi, Fahim and Durrani, Nadir and Nouri, Marzia and Sajjad, Hassan},
  title     = {Latent Concept-based Explanation of {NLP} Models},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  pages     = {12435--12459},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  month     = nov,
  year      = {2024},
  doi       = {10.18653/v1/2024.emnlp-main.692},
  url       = {https://aclanthology.org/2024.emnlp-main.692},
  area      = {Latent Concepts}
}

@inproceedings{mousi-etal-2024-exploring,
  author    = {Mousi, Basel and Durrani, Nadir and Dalvi, Fahim and Hawasly, Majd and Abdelali, Ahmed},
  title     = {Exploring Alignment in Shared Cross-lingual Spaces},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {6326--6348},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand},
  month     = aug,
  year      = {2024},
  url       = {https://aclanthology.org/2024.acl-long.344},
  area      = {Multilinguality}
}

@inproceedings{hawasly-etal-2024-scaling,
  author    = {Hawasly, Majd and Dalvi, Fahim and Durrani, Nadir},
  title     = {Scaling up Discovery of Latent Concepts in Deep {NLP} Models},
  editor    = {Graham, Yvette and Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {793--806},
  publisher = {Association for Computational Linguistics},
  address   = {St. Julian{'}s, Malta},
  month     = mar,
  year      = {2024},
  url       = {https://aclanthology.org/2024.eacl-long.48},
  area      = {Latent Concepts}
}

@article{sajjad-etal-2022-neuron,
  title = {Neuron-level Interpretation of Deep {NLP} Models: A Survey},
  author = {Sajjad, Hassan  and
      Durrani, Nadir  and
      Dalvi, Fahim},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {10},
  month = nov,
  year = {2022},
  address = {Cambridge, MA},
  publisher = {MIT Press},
  url = {https://aclanthology.org/2022.tacl-1.74},
  doi = {10.1162/tacl_a_00519},
  pages = {1285--1303},
  abstract = {The proliferation of Deep Neural Networks in various domains has seen an increased need for interpretability of these models. Preliminary work done along this line, and papers that surveyed such, are focused on high-level representation analysis. However, a recent branch of work has concentrated on interpretability at a more granular level of analyzing neurons within these models. In this paper, we survey the work done on neuron analysis including: i) methods to discover and understand neurons in a network; ii) evaluation methods; iii) major findings including cross architectural comparisons that neuron analysis has unraveled; iv) applications of neuron probing such as: controlling the model, domain adaptation, and so forth; and v) a discussion on open issues and future research directions.}
}

@article{chowdhury2024:csl,
  title = {What do end-to-end speech models learn about speaker, language and channel information? {A} layer-wise and neuron-level analysis},
  journal = {Computer Speech \& Language},
  address = {London, UK},
  volume = {83},
  pages = {101539},
  month = jan,
  year = {2024},
  issn = {0885-2308},
  doi = {10.1016/j.csl.2023.101539},
  url = {https://www.sciencedirect.com/science/article/pii/S088523082300058X},
  author = {Chowdhury, Shammur Absar and Durrani, Nadir and Ali, Ahmed},
  area = {Representation Analysis}
}

@inproceedings{mousi2023llms,
  title = {Can {LLM}s Facilitate Interpretation of Pre-trained Language Models?},
  author = {Mousi, Basel  and
      Durrani, Nadir  and
      Dalvi, Fahim},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  month = dec,
  year = {2023},
  publisher = {Association for Computational Linguistics},
  url = {https://browse.arxiv.org/pdf/2305.13386.pdf}
}

@article{JMLR:v24:23-0074,
  author = {Nadir Durrani and Fahim Dalvi and Hassan Sajjad},
  title = {Discovering Salient Neurons in deep {NLP} models},
  journal = {Journal of Machine Learning Research},
  year = {2023},
  volume = {24},
  number = {362},
  pages = {1--40},
  url = {http://jmlr.org/papers/v24/23-0074.html},
  abstract = {While a lot of work has been done in understanding representations learned within deep NLP models and what knowledge they capture, work done towards analyzing individual neurons is relatively sparse. We present a technique called Linguistic Correlation Analysis to extract salient neurons in the model, with respect to any extrinsic property, with the goal of understanding how such knowledge is preserved within neurons. We carry out a fine-grained analysis to answer the following questions: (i) can we identify subsets of neurons in the network that learn a specific linguistic property? (ii) is a certain linguistic phenomenon in a given model localized (encoded in few individual neurons) or distributed across many neurons? (iii) how redundantly is the information preserved? (iv) how does fine-tuning pre-trained models towards downstream NLP tasks impact the learned linguistic knowledge? (v) how do models vary in learning different linguistic properties? Our data-driven, quantitative analysis illuminates interesting findings: (i) we found small subsets of neurons that can predict different linguistic tasks; (ii) neurons capturing basic lexical information, such as suffixation, are localized in the lowermost layers; (iii) neurons learning complex concepts, such as syntactic role, are predominantly found in middle and higher layers; (iv) salient linguistic neurons are relocated from higher to lower layers during transfer learning, as the network preserves the higher layers for task-specific information; (v) we found interesting differences across pre-trained models regarding how linguistic information is preserved within them; and (vi) we found that concepts exhibit similar neuron distribution across different languages in the multilingual transformer models. Our code is publicly available as part of the NeuroX toolkit (Dalvi et al., 2023).},
  preview = {JMLR23.png},
  bibtex_show = {true},
  abbr = {JMLR'23},
  pdf = {https://jmlr.org/papers/volume24/23-0074/23-0074.pdf},
  area = {Neuron Analysis},
  selected = {true}
}

@inproceedings{fan2023evaluating,
  author    = {Fan, Yimin and Dalvi, Fahim and Durrani, Nadir and Sajjad, Hassan},
  title     = {Evaluating Neuron Interpretation Methods of {NLP} Models},
  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
  month     = dec,
  year      = {2023}
}

@inproceedings{dalvi-etal-2023-neurox,
  title = {{N}euro{X} Library for Neuron Analysis of Deep {NLP} Models},
  author = {Dalvi, Fahim  and
      Sajjad, Hassan  and
      Durrani, Nadir},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  month = jul,
  year = {2023},
  address = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2023.acl-demo.21},
  doi = {10.18653/v1/2023.acl-demo.21},
  pages = {226--234},
  abstract = {Neuron analysis provides insights into how knowledge is structured in representations and discovers the role of neurons in the network. In addition to developing an understanding of our models, neuron analysis enables various applications such as debiasing, domain adaptation and architectural search. We present NeuroX, a comprehensive open-source toolkit to conduct neuron analysis of natural language processing models. It implements various interpretation methods under a unified API, and provides a framework for data processing and evaluation, thus making it easier for researchers and practitioners to perform neuron analysis. The Python toolkit is available at https://www.github.com/fdalvi/NeuroX. Demo Video available at: https://youtu.be/mLhs2YMx4u8}
}

@inproceedings{sajjad-etal-2022-effect,
  author    = {Sajjad, Hassan  and
      Alam, Firoj  and
      Dalvi, Fahim  and
      Durrani, Nadir},
  title     = {Effect of Post-processing on Contextualized Word Representations},
  booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
  pages     = {3127--3142},
  publisher = {International Committee on Computational Linguistics},
  address   = {Gyeongju, Republic of Korea},
  month     = oct,
  year      = {2022},
  url       = {https://aclanthology.org/2022.coling-1.277},
  abstract  = {Post-processing of static embedding has been shown to improve their performance on both lexical and sequence-level tasks. However, post-processing for contextualized embeddings is an under-studied problem. In this work, we question the usefulness of post-processing for contextualized embeddings obtained from different layers of pre-trained language models. More specifically, we standardize individual neuron activations using z-score, min-max normalization, and by removing top principal components using the all-but-the-top method. Additionally, we apply unit length normalization to word representations. On a diverse set of pre-trained models, we show that post-processing unwraps vital information present in the representations for both lexical tasks (such as word similarity and analogy) and sequence classification tasks. Our findings raise interesting points in relation to the research studies that use contextualized representations, and suggest z-score normalization as an essential step to consider when using them in an application.}
}

@inproceedings{dalvi-etal-2023-nxplain,
  author    = {Dalvi, Fahim  and
    Durrani, Nadir  and
    Sajjad, Hassan  and
    Jaban, Tamim  and
    Husaini, Mus{'}ab  and
    Abbas, Ummar},
  title     = {{N}x{P}lain: A Web-based Tool for Discovery of Latent Concepts},
  booktitle = {Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations},
  pages     = {75--83},
  publisher = {Association for Computational Linguistics},
  address   = {Dubrovnik, Croatia},
  month     = may,
  year      = {2023},
  doi       = {10.18653/v1/2023.eacl-demo.10},
  url       = {https://aclanthology.org/2023.eacl-demo.10},
  abstract  = {The proliferation of deep neural networks in various domains has seen an increased need for the interpretability of these models, especially in scenarios where fairness and trust are as important as model performance. A lot of independent work is being carried out to: i) analyze what linguistic and non-linguistic knowledge is learned within these models, and ii) highlight the salient parts of the input. We present NxPlain, a web-app that provides an explanation of a model{'}s prediction using latent concepts. NxPlain discovers latent concepts learned in a deep NLP model, provides an interpretation of the knowledge learned in the model, and explains its predictions based on the used concepts. The application allows users to browse through the latent concepts in an intuitive order, letting them efficiently scan through the most salient concepts with a global corpus-level view and a local sentence-level view. Our tool is useful for debugging, unraveling model bias, and for highlighting spurious correlations in a model. A hosted demo is available here: https://nxplain.qcri.org}
}

@article{Alam_Dalvi_Durrani_Sajjad_Khan_Xu_2023,
  title = {ConceptX: A Framework for Latent Concept Analysis},
  volume = {37},
  url = {https://ojs.aaai.org/index.php/AAAI/article/view/27057},
  doi = {10.1609/aaai.v37i13.27057},
  abstract = {The opacity of deep neural networks remains a challenge in deploying solutions where explanation is as important as precision. We present ConceptX, a human-in-the-loop framework for interpreting and annotating latent representational space in pre-trained Language Models (pLMs). We use an unsupervised method to discover concepts learned in these models and enable a graphical interface for humans to generate explanations for the concepts. To facilitate the process, we provide auto-annotations of the concepts (based on traditional linguistic ontologies). Such annotations enable development of a linguistic resource that directly represents latent concepts learned within deep NLP models. These include not just traditional linguistic concepts, but also task-specific or sensitive concepts (words grouped based on gender or religious connotation) that helps the annotators to mark bias in the model. The framework consists of two parts (i) concept discovery and (ii) annotation platform.},
  number = {13},
  journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
  author = {Alam, Firoj and Dalvi, Fahim and Durrani, Nadir and Sajjad, Hassan and Khan, Abdul Rafae and Xu, Jia},
  year = {2023},
  month = sep,
  pages = {16395--16397}
}

@inproceedings{durrani-etal-2022-transformation,
  author    = {Durrani, Nadir  and
      Sajjad, Hassan  and
      Dalvi, Fahim  and
      Alam, Firoj},
  title     = {On the Transformation of Latent Space in Fine-Tuned {NLP} Models},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  pages     = {1495--1516},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates},
  month     = dec,
  year      = {2022},
  doi       = {10.18653/v1/2022.emnlp-main.97},
  url       = {https://aclanthology.org/2022.emnlp-main.97},
  abstract  = {We study the evolution of latent space in fine-tuned NLP models. Different from the commonly used probing-framework, we opt for an unsupervised method to analyze representations. More specifically, we discover latent concepts in the representational space using hierarchical clustering. We then use an alignment function to gauge the similarity between the latent space of a pre-trained model and its fine-tuned version. We use traditional linguistic concepts to facilitate our understanding and also study how the model space transforms towards task-specific information. We perform a thorough analysis, comparing pre-trained and fine-tuned models across three models and three downstream tasks. The notable findings of our work are: i) the latent space of the higher layers evolve towards task-specific concepts, ii) whereas the lower layers retain generic concepts acquired in the pre-trained model, iii) we discovered that some concepts in the higher layers acquire polarity towards the output class, and iv) that these concepts can be used for generating adversarial triggers.}
}

@article{sajjad2023:csl,
  address = {London, UK},
  author = {Sajjad, Hassan and Dalvi, Fahim and Durrani, Nadir and Nakov, Preslav},
  issn = {0885-2308},
  doi = {10.1016/j.csl.2022.101429},
  url = {https://www.sciencedirect.com/science/article/pii/S0885230822000596},
  issue_date = {January 2023},
  journal = {Computer Speech \& Language},
  number = {C},
  publisher = {Academic Press Ltd.},
  title = {On the Effect of Dropping Layers of Pre-trained Transformer Models},
  volume = {77},
  pages = {101429},
  month = jan,
  year = {2023},
  area = {Transfer Learning}
}

@inproceedings{abdelali-2021-arabic-transformers,
  author    = {Abdelali, Ahmed  and
      Durrani, Nadir  and
      Dalvi, Fahim   and
      Sajjad, Hassan},
  title     = {Post-hoc analysis of Arabic transformer models},
  booktitle = {Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates},
  month     = dec,
  year      = {2022},
  abstract  = {Arabic is a Semitic language which is widely spoken with many dialects. Given the success of pre-trained language models, many transformer models trained on Arabic and its dialects have surfaced. While there have been an extrinsic evaluation of these models with respect to downstream NLP tasks, no work has been carried out to analyze and compare their internal representations. We probe how linguistic information is encoded in the transformer models, trained on different Arabic dialects. We perform a layer and neuron analysis on the models using morphological tagging tasks for different dialects of Arabic and a dialectal identification task. Our analysis enlightens interesting findings such as: i) word morphology is learned at the lower and middle layers, ii) while syntactic dependencies are predominantly captured at the higher layers, iii) despite a large overlap in their vocabulary, the MSA-based models fail to capture the nuances of Arabic dialects, iv) we found that neurons in embedding layers are polysemous in nature, while the neurons in middle layers are exclusive to specific properties.}
}

@inproceedings{sajjad-etal-2022-analyzing,
  title = {Analyzing Encoded Concepts in Transformer Language Models},
  author = {Sajjad, Hassan  and
      Durrani, Nadir  and
      Dalvi, Fahim  and
      Alam, Firoj  and
      Khan, Abdul  and
      Xu, Jia},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month = jul,
  year = {2022},
  address = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2022.naacl-main.225},
  doi = {10.18653/v1/2022.naacl-main.225},
  pages = {3082--3101},
  abstract = {We propose a novel framework ConceptX, to analyze how latent concepts are encoded in representations learned within pre-trained language models. It uses clustering to discover the encoded concepts and explains them by aligning with a large set of human-defined concepts. Our analysis on seven transformer language models reveal interesting insights: i) the latent space within the learned representations overlap with different linguistic concepts to a varying degree, ii) the lower layers in the model are dominated by lexical concepts (e.g., affixation) and linguistic ontologies (e.g. WordNet), whereas the core-linguistic concepts (e.g., morphology, syntactic relations) are better represented in the middle and higher layers, iii) some encoded concepts are multi-faceted and cannot be adequately explained using the existing human-defined concepts.}
}

@inproceedings{dalvi2020discovering,
  title = {Discovering Latent Concepts Learned in {BERT}},
  author = {Dalvi, Fahim and Khan, Abdul and Alam, Firoj and Durrani, Nadir and Xu, Jia and Sajjad, Hassan},
  booktitle = {International Conference on Learning Representations},
  month = may,
  year = {2022},
  url = {https://openreview.net/forum?id=POTMtpYI1xH}
}

@inproceedings{belinkov:2017:acl,
  author = {Belinkov, Yonatan and Durrani, Nadir and Dalvi, Fahim and Sajjad, Hassan and Glass, James},
  title = {What do Neural Machine Translation Models Learn about Morphology?},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL)},
  month = jul,
  year = {2017},
  address = {Vancouver},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/P17-1080}
}

@inproceedings{belinkov-etal-2017-evaluating,
  title = {Evaluating Layers of Representation in Neural Machine Translation on Part-of-Speech and Semantic Tagging Tasks},
  author = {Belinkov, Yonatan  and
      M{\`a}rquez, Llu{\'\i}s  and
      Sajjad, Hassan  and
      Durrani, Nadir  and
      Dalvi, Fahim  and
      Glass, James},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month = nov,
  year = {2017},
  address = {Taipei, Taiwan},
  publisher = {Asian Federation of Natural Language Processing},
  url = {https://aclanthology.org/I17-1001},
  pages = {1--10}
}

@inproceedings{dalvi-etal-2017-understanding,
  author    = {Dalvi, Fahim  and
      Durrani, Nadir  and
      Sajjad, Hassan  and
      Belinkov, Yonatan  and
      Vogel, Stephan},
  title     = {Understanding and Improving Morphological Learning in the Neural Machine Translation Decoder},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  pages     = {142--151},
  publisher = {Asian Federation of Natural Language Processing},
  address   = {Taipei, Taiwan},
  month     = nov,
  year      = {2017},
  url       = {https://aclanthology.org/I17-1015},
  abstract  = {End-to-end training makes the neural machine translation (NMT) architecture simpler, yet elegant compared to traditional statistical machine translation (SMT). However, little is known about linguistic patterns of morphology, syntax and semantics learned during the training of NMT systems, and more importantly, which parts of the architecture are responsible for learning each of these phenomenon. In this paper we i) analyze how much morphology an NMT decoder learns, and ii) investigate whether injecting target morphology in the decoder helps it to produce better translations. To this end we present three methods: i) simultaneous translation, ii) joint-data learning, and iii) multi-task learning. Our results show that explicit morphological information helps the decoder learn target language morphology and improves the translation quality by 0.2{--}0.6 BLEU points.}
}

@inproceedings{bau2018identifying,
  author    = {Anthony Bau and Yonatan Belinkov and Hassan Sajjad and Nadir Durrani and Fahim Dalvi and James Glass},
  title     = {Identifying and Controlling Important Neurons in Neural Machine Translation},
  booktitle = {International Conference on Learning Representations},
  year      = {2019},
  url       = {https://openreview.net/forum?id=H1z-PsR5KX}
}

@inproceedings{dalvi:2019:AAAI,
  title = {What Is One Grain of Sand in the Desert? Analyzing Individual Neurons in Deep {NLP} Models},
  author = {Dalvi, Fahim and Durrani, Nadir and Sajjad, Hassan and Belinkov, Yonatan and Bau, D. Anthony and Glass, James},
  booktitle = {Proceedings of the Thirty-Third AAAI Conference on Artificial Intelligence (AAAI, Oral presentation)},
  year = {2019},
  url = {https://ojs.aaai.org/index.php/AAAI/article/view/4592/4470},
  month = jan
}

@inproceedings{neurox-aaai19:demo,
  author = {Fahim Dalvi and Avery Nortonsmith and D. Anthony Bau and Yonatan Belinkov and Hassan Sajjad and Nadir Durrani and James Glass},
  booktitle = {AAAI Conference on Artificial Intelligence (AAAI)},
  address = {Honolulu, USA},
  month = jan,
  title = {{NeuroX}: A Toolkit for Analyzing Individual Neurons in Neural Networks},
  url = {https://www.aaai.org/ojs/index.php/AAAI/article/view/5063},
  year = {2019}
}

@inproceedings{durrani-etal-2019-one,
  title = {One Size Does Not Fit All: Comparing {NMT} Representations of Different Granularities},
  author = {Durrani, Nadir  and
      Dalvi, Fahim  and
      Sajjad, Hassan  and
      Belinkov, Yonatan  and
      Nakov, Preslav},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  month = jun,
  year = {2019},
  address = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/N19-1154},
  doi = {10.18653/v1/N19-1154},
  pages = {1504--1516}
}

@inproceedings{durrani-etal-2020-analyzing,
  title = {Analyzing Individual Neurons in Pre-trained Language Models},
  author = {Durrani, Nadir  and
      Sajjad, Hassan  and
      Dalvi, Fahim  and
      Belinkov, Yonatan},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  month = nov,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.emnlp-main.395},
  doi = {10.18653/v1/2020.emnlp-main.395},
  pages = {4865--4880},
  abstract = {While a lot of analysis has been carried to demonstrate linguistic knowledge captured by the representations learned within deep NLP models, very little attention has been paid towards individual neurons. We carry out a neuron-level analysis using core linguistic tasks of predicting morphology, syntax and semantics, on pre-trained language models, with questions like: i) do individual neurons in pre-trained models capture linguistic information? ii) which parts of the network learn more about certain linguistic phenomena? iii) how distributed or focused is the information? and iv) how do various architectures differ in learning these properties? We found small subsets of neurons to predict linguistic tasks, with lower level tasks (such as morphology) localized in fewer neurons, compared to higher level task of predicting syntax. Our study also reveals interesting cross architectural comparisons. For example, we found neurons in XLNet to be more localized and disjoint when predicting properties compared to BERT and others, where they are more distributed and coupled.}
}

@inproceedings{dalvi-etal-2020-analyzing,
  author    = {Dalvi, Fahim  and
      Sajjad, Hassan  and
      Durrani, Nadir  and
      Belinkov, Yonatan},
  title     = {Analyzing Redundancy in Pretrained Transformer Models},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  pages     = {4908--4926},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = nov,
  year      = {2020},
  doi       = {10.18653/v1/2020.emnlp-main.398},
  url       = {https://aclanthology.org/2020.emnlp-main.398},
  abstract  = {Transformer-based deep NLP models are trained using hundreds of millions of parameters, limiting their applicability in computationally constrained environments. In this paper, we study the cause of these limitations by defining a notion of Redundancy, which we categorize into two classes: General Redundancy and Task-specific Redundancy. We dissect two popular pretrained models, BERT and XLNet, studying how much redundancy they exhibit at a representation-level and at a more fine-grained neuron-level. Our analysis reveals interesting insights, such as i) 85{\%} of the neurons across the network are redundant and ii) at least 92{\%} of them can be removed when optimizing towards a downstream task. Based on our analysis, we present an efficient feature-based transfer learning procedure, which maintains 97{\%} performance while using at-most 10{\%} of the original neurons.}
}

@article{belinkov-etal-2020-linguistic,
  title = {On the Linguistic Representational Power of Neural Machine Translation Models},
  author = {Belinkov, Yonatan  and
      Durrani, Nadir  and
      Dalvi, Fahim  and
      Sajjad, Hassan  and
      Glass, James},
  journal = {Computational Linguistics},
  volume = {46},
  number = {1},
  month = mar,
  year = {2020},
  url = {https://aclanthology.org/2020.cl-1.1},
  doi = {10.1162/coli_a_00367},
  pages = {1--52},
  abstract = {Despite the recent success of deep neural networks in natural language processing and other spheres of artificial intelligence, their interpretability remains a challenge. We analyze the representations learned by neural machine translation (NMT) models at various levels of granularity and evaluate their quality through relevant extrinsic properties. In particular, we seek answers to the following questions: (i) How accurately is word structure captured within the learned representations, which is an important aspect in translating morphologically rich languages? (ii) Do the representations capture long-range dependencies, and effectively handle syntactically divergent languages? (iii) Do the representations capture lexical semantics? We conduct a thorough investigation along several parameters: (i) Which layers in the architecture capture each of these linguistic phenomena; (ii) How does the choice of translation unit (word, character, or subword unit) impact the linguistic properties captured by the underlying representations? (iii) Do the encoder and decoder learn differently and independently? (iv) Do the representations learned by multilingual NMT models capture the same amount of linguistic information as their bilingual counterparts? Our data-driven, quantitative evaluation illuminates important aspects in NMT models and their ability to capture various linguistic phenomena. We show that deep NMT models trained in an end-to-end fashion, without being provided any direct supervision during the training process, learn a non-trivial amount of linguistic information. Notable findings include the following observations: (i) Word morphology and part-of-speech information are captured at the lower layers of the model; (ii) In contrast, lexical semantics or non-local syntactic and semantic dependencies are better represented at the higher layers of the model; (iii) Representations learned using characters are more informed about word-morphology compared to those learned using subword units; and (iv) Representations learned by multilingual models are richer compared to bilingual models.}
}

@inproceedings{wu-etal-2020-similarity,
  author    = {Wu, John  and
      Belinkov, Yonatan  and
      Sajjad, Hassan  and
      Durrani, Nadir  and
      Dalvi, Fahim  and
      Glass, James},
  title     = {Similarity Analysis of Contextual Word Representation Models},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  pages     = {4638--4655},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jul,
  year      = {2020},
  doi       = {10.18653/v1/2020.acl-main.422},
  url       = {https://aclanthology.org/2020.acl-main.422},
  abstract  = {This paper investigates contextual word representation models from the lens of similarity analysis. Given a collection of trained models, we measure the similarity of their internal representations and attention. Critically, these models come from vastly different architectures. We use existing and novel similarity measures that aim to gauge the level of localization of information in the deep models, and facilitate the investigation of which design factors affect model similarity, without requiring any external linguistic annotation. The analysis reveals that models within the same family are more similar to one another, as may be expected. Surprisingly, different architectures have rather similar representations, but different individual neurons. We also observed differences in information localization in lower and higher layers and found that higher layers are more affected by fine-tuning on downstream tasks.}
}

@inproceedings{sajjad-etal-2021-fine,
  author    = {Sajjad, Hassan  and
      Kokhlikyan, Narine  and
      Dalvi, Fahim  and
      Durrani, Nadir},
  title     = {Fine-grained Interpretation and Causation Analysis in Deep {NLP} Models},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Tutorials},
  pages     = {5--10},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jun,
  year      = {2021},
  doi       = {10.18653/v1/2021.naacl-tutorials.2},
  url       = {https://aclanthology.org/2021.naacl-tutorials.2},
  abstract  = {Deep neural networks have constantly pushed the state-of-the-art performance in natural language processing and are considered as the de-facto modeling approach in solving complex NLP tasks such as machine translation, summarization and question-answering. Despite the proven efficacy of deep neural networks at-large, their opaqueness is a major cause of concern. In this tutorial, we will present research work on interpreting fine-grained components of a neural network model from two perspectives, i) fine-grained interpretation, and ii) causation analysis. The former is a class of methods to analyze neurons with respect to a desired language concept or a task. The latter studies the role of neurons and input features in explaining the decisions made by the model. We will also discuss how interpretation methods and causation analysis can connect towards better interpretability of model prediction. Finally, we will walk you through various toolkits that facilitate fine-grained interpretation and causation analysis of neural models.}
}

@inproceedings{durrani-etal-2021-transfer,
  author    = {Durrani, Nadir  and
      Sajjad, Hassan  and
      Dalvi, Fahim},
  title     = {How transfer learning impacts linguistic knowledge in deep {NLP} models?},
  booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
  pages     = {4947--4957},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = aug,
  year      = {2021},
  doi       = {10.18653/v1/2021.findings-acl.438},
  url       = {https://aclanthology.org/2021.findings-acl.438}
}

This file was generated by bibtex2html 1.99.