Source code for neurox.data.representations

"""Utility functions to manage representations.

This module contains helper functions for managing extracted
representations, specifically for sub-word based data.
"""
import numpy as np
from tqdm import tqdm


def bpe_get_avg_activations(tokens, activations):
    """Aggregates activations by averaging, assuming BPE-based tokenization.

    Given loaded tokens data and activations, this function aggregates
    activations over the tokenized text. BPE-based tokenization is assumed,
    with every non-terminal subword ending in "@@". The activations of all
    subwords of a word are averaged to produce that word's activation.

    .. warning::
        This function is deprecated and will be removed in future versions.

    Parameters
    ----------
    tokens : dict
        Dictionary containing three lists, ``source``, ``source_aux`` and
        ``target``. Usually the output of ``data.loader.load_aux_data``.
    activations : list of numpy.ndarray
        Activations returned from ``loader.load_activations``.

    Returns
    -------
    activations : list of numpy.ndarray
        Subword-aggregated activations, one per actual token in the
        untokenized text.
    """
    all_activations = []
    num_neurons = activations[0].shape[1]
    for i in range(len(tokens["source_aux"])):
        source = tokens["source"][i]
        source_aux = tokens["source_aux"][i]
        num_words = len(source)
        new_activations = np.zeros((num_words, num_neurons))

        # Find the index of the last subword of every word by concatenating
        # subwords until they match the current word in the untokenized text.
        word_boundaries = []
        source_index = 0
        current_bpe = ""
        for j, subword in enumerate(source_aux):
            current_word = source[source_index]
            current_bpe = current_bpe + subword
            if current_bpe != current_word:
                # Non-terminal subword; strip the trailing "@@" marker.
                current_bpe = current_bpe[:-2]
            else:
                word_boundaries.append(j)
                source_index += 1
                current_bpe = ""
        assert len(word_boundaries) == num_words

        # Average all subword activations belonging to each word.
        prev_idx = 0
        for word_idx, boundary in enumerate(word_boundaries):
            avg_vector = np.average(
                activations[i][prev_idx : boundary + 1, :], axis=0
            )
            new_activations[word_idx, :] = avg_vector
            prev_idx = boundary + 1
        all_activations.append(new_activations)
    return all_activations
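

# A minimal usage sketch (illustration only; the data below is hypothetical,
# while in practice ``tokens`` comes from ``data.loader.load_aux_data`` and
# ``activations`` from ``loader.load_activations``).
def _example_bpe_get_avg_activations():
    # One sentence, "hello world", BPE-tokenized into 4 subwords; 8 neurons.
    tokens = {
        "source": [["hello", "world"]],
        "source_aux": [["he@@", "llo", "wor@@", "ld"]],
        "target": [["bonjour", "monde"]],
    }
    activations = [np.random.rand(4, 8)]

    word_activations = bpe_get_avg_activations(tokens, activations)
    assert word_activations[0].shape == (2, 8)  # one row per word
    # "hello" is the average of its two subword rows, "he@@" and "llo".
    assert np.allclose(word_activations[0][0], activations[0][:2].mean(axis=0))
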
def bpe_get_last_activations(tokens, activations, is_brnn=True):
    """Aggregates activations by picking the last subword, assuming BPE-based
    tokenization.

    Given loaded tokens data and activations, this function aggregates
    activations over the tokenized text. BPE-based tokenization is assumed,
    with every non-terminal subword ending in "@@". Each word's activation is
    taken from its last subword (and, for bidirectional models, from its
    first subword for the backward direction).

    .. warning::
        This function is deprecated and will be removed in future versions.

    Parameters
    ----------
    tokens : dict
        Dictionary containing three lists, ``source``, ``source_aux`` and
        ``target``. Usually the output of ``data.loader.load_aux_data``.
    activations : list of numpy.ndarray
        Activations returned from ``loader.load_activations``.
    is_brnn : bool, optional
        Whether the model from which the activations were extracted was
        bidirectional. Only applies to RNN models.

    Returns
    -------
    activations : list of numpy.ndarray
        Subword-aggregated activations, one per actual token in the
        untokenized text.
    """
    all_activations = []
    num_neurons = activations[0].shape[1]
    for i in range(len(tokens["source_aux"])):
        source = tokens["source"][i]
        source_aux = tokens["source_aux"][i]
        num_words = len(source)
        new_activations = np.zeros((num_words, num_neurons))

        # Find the index of the last subword of every word by concatenating
        # subwords until they match the current word in the untokenized text.
        word_boundaries = []
        source_index = 0
        current_bpe = ""
        for j, subword in enumerate(source_aux):
            current_word = source[source_index]
            current_bpe = current_bpe + subword
            if current_bpe != current_word:
                # Non-terminal subword; strip the trailing "@@" marker.
                current_bpe = current_bpe[:-2]
            else:
                word_boundaries.append(j)
                source_index += 1
                current_bpe = ""
        assert len(word_boundaries) == num_words

        # For a bidirectional RNN, neurons 0 to num_neurons/2 belong to the
        # forward direction and the rest to the backward direction.
        rnn_boundary = num_neurons // 2
        if not is_brnn:
            rnn_boundary = num_neurons

        prev_idx = 0
        for word_idx, boundary in enumerate(word_boundaries):
            # Forward direction: pick the word's last subword.
            new_activations[word_idx, :rnn_boundary] = activations[i][
                boundary, :rnn_boundary
            ]
            if is_brnn:
                # Backward direction: pick the word's first subword.
                new_activations[word_idx, rnn_boundary:] = activations[i][
                    prev_idx, rnn_boundary:
                ]
            prev_idx = boundary + 1
        all_activations.append(new_activations)
    return all_activations
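

# A sketch of the last-subword variant on the same hypothetical data as
# above. With ``is_brnn=True`` and 8 neurons, columns 0-3 (forward direction)
# come from each word's last subword and columns 4-7 (backward direction)
# from its first subword.
def _example_bpe_get_last_activations():
    tokens = {
        "source": [["hello", "world"]],
        "source_aux": [["he@@", "llo", "wor@@", "ld"]],
        "target": [["bonjour", "monde"]],
    }
    activations = [np.random.rand(4, 8)]

    word_activations = bpe_get_last_activations(tokens, activations, is_brnn=True)
    # The forward half of "hello" comes from "llo", its last subword ...
    assert np.allclose(word_activations[0][0, :4], activations[0][1, :4])
    # ... and the backward half from "he@@", its first subword.
    assert np.allclose(word_activations[0][0, 4:], activations[0][0, 4:])
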
def char_get_avg_activations(tokens, activations):
    """Aggregates activations by averaging, assuming character-based
    tokenization.

    Given loaded tokens data and activations, this function aggregates
    activations over the character-tokenized text, where words are separated
    by a single "_" character. The activations of all characters of a word
    are averaged to produce that word's activation.

    .. warning::
        This function is deprecated and will be removed in future versions.

    Parameters
    ----------
    tokens : dict
        Dictionary containing three lists, ``source``, ``source_aux`` and
        ``target``. Usually the output of ``data.loader.load_aux_data``.
    activations : list of numpy.ndarray
        Activations returned from ``loader.load_activations``.

    Returns
    -------
    activations : list of numpy.ndarray
        Character-aggregated activations, one per actual token in the
        untokenized text.
    """
    all_activations = []
    num_neurons = activations[0].shape[1]
    for i in tqdm(range(len(tokens["source_aux"]))):
        source = tokens["source"][i]
        source_aux = tokens["source_aux"][i]
        num_words = len(source)
        new_activations = np.zeros((num_words, num_neurons))

        # Compute the index of the last character of every word. Consecutive
        # words are separated by one "_" character.
        word_boundaries = []
        for word_idx, word in enumerate(source):
            if word_idx == 0:
                word_boundaries.append(len(word) - 1)
            else:
                word_boundaries.append(len(word) + 1 + word_boundaries[-1])
        assert len(word_boundaries) == num_words, (
            "Boundary mismatch! (line: %d, boundaries: %d, words: %d)"
            % (i + 1, len(word_boundaries), num_words)
        )
        assert (
            source_aux.count("_") + 1 - source.count("_") == num_words
        ), "Word counts don't match! (line: %d, source: %d, aux: %d)\n%s\n%s" % (
            i + 1,
            num_words,
            source_aux.count("_") + 1,
            " ".join(source),
            " ".join(source_aux),
        )

        # Average all character activations belonging to each word, skipping
        # the separator character after every word.
        prev_idx = 0
        for word_idx, boundary in enumerate(word_boundaries):
            avg_vector = np.average(
                activations[i][prev_idx : boundary + 1, :], axis=0
            )
            new_activations[word_idx, :] = avg_vector
            prev_idx = boundary + 2
        all_activations.append(new_activations)
    return all_activations
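

# A minimal sketch with hypothetical character-level data: "hello world"
# becomes 11 characters with the "_" separator at index 5, so the word
# boundaries are 4 ("o") and 10 ("d") and the separator row is excluded
# from both averages.
def _example_char_get_avg_activations():
    tokens = {
        "source": [["hello", "world"]],
        "source_aux": [list("hello") + ["_"] + list("world")],
        "target": [["bonjour", "monde"]],
    }
    activations = [np.random.rand(11, 8)]  # 11 characters x 8 neurons

    word_activations = char_get_avg_activations(tokens, activations)
    assert word_activations[0].shape == (2, 8)
    # "world" is the mean of character rows 6..10; row 5 ("_") is skipped.
    assert np.allclose(word_activations[0][1], activations[0][6:11].mean(axis=0))
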
def char_get_last_activations(tokens, activations, is_brnn=True):
    """Aggregates activations by picking the last character, assuming
    character-based tokenization.

    Given loaded tokens data and activations, this function aggregates
    activations over the character-tokenized text, where words are separated
    by a single "_" character. Each word's activation is taken from its last
    character (and, for bidirectional models, from its first character for
    the backward direction).

    .. warning::
        This function is deprecated and will be removed in future versions.

    Parameters
    ----------
    tokens : dict
        Dictionary containing three lists, ``source``, ``source_aux`` and
        ``target``. Usually the output of ``data.loader.load_aux_data``.
    activations : list of numpy.ndarray
        Activations returned from ``loader.load_activations``.
    is_brnn : bool, optional
        Whether the model from which the activations were extracted was
        bidirectional. Only applies to RNN models.

    Returns
    -------
    activations : list of numpy.ndarray
        Character-aggregated activations, one per actual token in the
        untokenized text.
    """
    all_activations = []
    num_neurons = activations[0].shape[1]
    for i in tqdm(range(len(tokens["source_aux"]))):
        source = tokens["source"][i]
        source_aux = tokens["source_aux"][i]
        num_words = len(source)
        new_activations = np.zeros((num_words, num_neurons))

        # Compute the index of the last character of every word. Consecutive
        # words are separated by one "_" character.
        word_boundaries = []
        for word_idx, word in enumerate(source):
            if word_idx == 0:
                word_boundaries.append(len(word) - 1)
            else:
                word_boundaries.append(len(word) + 1 + word_boundaries[-1])
        assert len(word_boundaries) == num_words, (
            "Boundary mismatch! (line: %d, boundaries: %d, words: %d)"
            % (i + 1, len(word_boundaries), num_words)
        )
        assert (
            source_aux.count("_") + 1 - source.count("_") == num_words
        ), "Word counts don't match! (line: %d, source: %d, aux: %d)\n%s\n%s" % (
            i + 1,
            num_words,
            source_aux.count("_") + 1,
            " ".join(source),
            " ".join(source_aux),
        )

        # For a bidirectional RNN, neurons 0 to num_neurons/2 belong to the
        # forward direction and the rest to the backward direction.
        rnn_boundary = num_neurons // 2
        if not is_brnn:
            rnn_boundary = num_neurons

        prev_idx = 0
        for word_idx, boundary in enumerate(word_boundaries):
            # Forward direction: pick the word's last character.
            new_activations[word_idx, :rnn_boundary] = activations[i][
                boundary, :rnn_boundary
            ]
            if is_brnn:
                # Backward direction: pick the word's first character.
                new_activations[word_idx, rnn_boundary:] = activations[i][
                    prev_idx, rnn_boundary:
                ]
            # Skip the "_" separator so the next word starts at its first
            # character rather than at the separator.
            prev_idx = boundary + 2
        all_activations.append(new_activations)
    return all_activations
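

# A sketch of the last-character variant on the same hypothetical data as
# above. For a bidirectional model, each word's forward half comes from its
# last character and its backward half from its first character.
def _example_char_get_last_activations():
    tokens = {
        "source": [["hello", "world"]],
        "source_aux": [list("hello") + ["_"] + list("world")],
        "target": [["bonjour", "monde"]],
    }
    activations = [np.random.rand(11, 8)]

    word_activations = char_get_last_activations(tokens, activations, is_brnn=True)
    # The forward half of "hello" is row 4 ("o"), its last character ...
    assert np.allclose(word_activations[0][0, :4], activations[0][4, :4])
    # ... and the backward half is row 0 ("h"), its first character.
    assert np.allclose(word_activations[0][0, 4:], activations[0][0, 4:])
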
def sent_get_last_activations(tokens, activations):
    """Gets the summary vector for each input sentence.

    Given loaded tokens data and activations, this function picks the final
    token's activations for every sentence, essentially giving a summary
    vector for every sentence in the dataset. This is mostly applicable to
    RNNs.

    .. note::
        Bidirectionality is currently not handled in the case of BiRNNs.

    Parameters
    ----------
    tokens : dict
        Dictionary containing three lists, ``source``, ``source_aux`` and
        ``target``. Usually the output of ``data.loader.load_aux_data``.
    activations : list of numpy.ndarray
        Activations returned from ``loader.load_activations``.

    Returns
    -------
    activations : list of numpy.ndarray
        Summary activations, one per sentence in the original text.
    """
    all_activations = []
    num_neurons = activations[0].shape[1]
    for i in tqdm(range(len(tokens["source"]))):
        # Pick the activation row of the sentence's final token.
        new_activations = np.zeros((1, num_neurons))
        new_activations[0, :] = activations[i][-1, :]
        all_activations.append(new_activations)
    return all_activations
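

# A minimal sketch with hypothetical data: two sentences of two and one
# tokens respectively, each reduced to a single summary row taken from its
# final token.
def _example_sent_get_last_activations():
    tokens = {
        "source": [["hello", "world"], ["goodbye"]],
        "source_aux": [[], []],  # unused by this function
        "target": [[], []],
    }
    activations = [np.random.rand(2, 8), np.random.rand(1, 8)]

    sentence_vectors = sent_get_last_activations(tokens, activations)
    assert len(sentence_vectors) == 2
    assert sentence_vectors[0].shape == (1, 8)
    assert np.allclose(sentence_vectors[0][0], activations[0][-1])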