"""Module for Probeless method
This module extracts neuron ranking for a label/tag (e.g Verbs) or for an
entire property set (e.g Part of speech) without training any probes.
.. seealso::
    `Antverg, Omer and Belinkov, Yonatan "On The Pitfalls Of Analyzing Individual Neurons in Language Models." In Proceedings of the 10th International Conference on Learning Representations (ICLR). <https://arxiv.org/abs/2110.07483>`_
"""
import numpy as np
import itertools
def _get_mean_vectors(X_train, y_train):
    """
    Compute the mean activation vector of every class label.

    Rows of ``X_train`` are grouped by the label found at the same position
    in ``y_train`` and each group is averaged along axis 0.

    Parameters
    ----------
    X_train : numpy.ndarray
        Activations, indexed along the first axis by token position.
    y_train : numpy.ndarray
        Class label for each token position.

    Returns
    -------
    avg_embeddings : list
        Mean vectors, one per distinct label, ordered by the first
        appearance of each label in ``y_train``.
    average_embeddings_by_label : dict
        Mapping from label to its mean vector (same vectors as in
        ``avg_embeddings``).

    Raises
    ------
    IndexError
        If ``X_train`` has fewer rows than ``y_train`` has labels.
    """
    embeddings_by_labels = {}
    for words_idx, label in enumerate(y_train):
        # Create the bucket the first time a label is seen, then always
        # append the token's activations to it.
        embeddings_by_labels.setdefault(label, []).append(X_train[words_idx])

    average_embeddings_by_label = {
        label: np.mean(vectors, axis=0)
        for label, vectors in embeddings_by_labels.items()
    }
    avg_embeddings = list(average_embeddings_by_label.values())
    return avg_embeddings, average_embeddings_by_label
def _get_overall_ranking(mean_vectors):
    """
    Rank neurons by how strongly the class means disagree on them.

    For every unordered pair of mean vectors the element-wise absolute
    difference is accumulated; neurons are then ordered from the largest
    accumulated difference to the smallest.

    Parameters
    ----------
    mean_vectors : list
        Per-class mean vectors; the first entry fixes the shape and dtype
        of the accumulator.

    Returns
    -------
    ranking : list
        Neuron indices in decreasing order of accumulated difference.
    """
    accumulated = np.zeros_like(mean_vectors[0])
    for left, right in itertools.combinations(mean_vectors, 2):
        accumulated += np.absolute(left - right)
    ascending = np.argsort(accumulated)
    # argsort is ascending; reverse it to put the most separating neurons first.
    return ascending[::-1].tolist()
def _get_tag_wise_ranking(mean_vectors_by_label, tag):
    """
    Rank neurons by how far one class mean lies from all the others.

    Parameters
    ----------
    mean_vectors_by_label : dict
        Mapping from label to that label's mean vector.
    tag : hashable
        Key of ``mean_vectors_by_label`` whose mean vector is compared
        against every other entry.

    Returns
    -------
    summation : numpy.ndarray
        Per-neuron sum of absolute differences between the ``tag`` mean
        and each other label's mean.
    ranking : list
        Neuron indices in decreasing order of ``summation``.
    """
    target = mean_vectors_by_label[tag]
    # |target - target| seeds the accumulator with the target's own shape
    # (all zeros whenever the target values are finite).
    scores = np.absolute(np.subtract(target, target))
    for label, other in mean_vectors_by_label.items():
        if tag == label:
            continue
        distance = np.absolute(np.subtract(target, other))
        scores = np.add(scores, distance)
    ascending = np.argsort(scores)
    return scores, ascending[::-1].tolist()
def get_neuron_ordering(X_train, y_train):
    """
    Returns a list of top neurons w.r.t the overall task e.g. POS

    The per-class mean vectors are obtained from ``_get_mean_vectors`` and
    ranked with ``_get_overall_ranking``.

    Parameters
    ----------
    X_train : numpy.ndarray
        Numpy Matrix of size [``NUM_TOKENS`` x ``NUM_NEURONS``]. Usually the
        output of ``interpretation.utils.create_tensors``
    y_train : numpy.ndarray
        Numpy Vector of size [``NUM_TOKENS``] with class labels for each input
        token. Usually the output of ``interpretation.utils.create_tensors``.

    Returns
    -------
    ranking : list
        list of ``NUM_NEURONS`` neuron indices, in decreasing order of importance.
    """
    # Only the list of mean vectors is needed here; the by-label mapping is unused.
    avg_embeddings, _ = _get_mean_vectors(X_train, y_train)
    return _get_overall_ranking(avg_embeddings)
def get_neuron_ordering_for_tag(X_train, y_train, label2idx, tag):
    """
    Returns a list of top neurons w.r.t a tag e.g. noun

    The per-class mean vectors are obtained from ``_get_mean_vectors`` and
    the class ``label2idx[tag]`` is ranked against the others with
    ``_get_tag_wise_ranking``.

    Parameters
    ----------
    X_train : numpy.ndarray
        Numpy Matrix of size [``NUM_TOKENS`` x ``NUM_NEURONS``]. Usually the
        output of ``interpretation.utils.create_tensors``
    y_train : numpy.ndarray
        Numpy Vector of size [``NUM_TOKENS``] with class labels for each input
        token. Usually the output of ``interpretation.utils.create_tensors``.
    label2idx : dict
        Class name to index mapping. ``label2idx[tag]`` is used as the class
        label to look up among the labels of ``y_train``.
        NOTE(review): presumably also returned by
        ``interpretation.utils.create_tensors`` -- confirm.
    tag : string
        tag for which rankings are extracted

    Returns
    -------
    ranking : list
        list of ``NUM_NEURONS`` neuron indices, in decreasing order of importance.

    Raises
    ------
    KeyError
        If ``tag`` is not a key of ``label2idx``, or if ``label2idx[tag]``
        has no mean vector (i.e. is not among the computed class labels).
    """
    # Only the by-label mapping is needed; the flat list of means is unused.
    _, average_embeddings_by_label = _get_mean_vectors(X_train, y_train)
    # The per-neuron scores are discarded; callers only receive the ordering.
    _, ranking = _get_tag_wise_ranking(average_embeddings_by_label, label2idx[tag])
    return ranking