"""Loading functions for activations, input tokens/sentences and labels
This module contains functions to load activations as well as source files with
tokens and labels. Functions that support tokenized data are also provided.
"""
import pickle
import json
import h5py
import numpy as np
import torch
def load_activations(activations_path, num_neurons_per_layer=None, is_brnn=False):
"""Load extracted activations.
Parameters
----------
activations_path : str
Path to the activations file. Can be of type t7, pt, acts, json or hdf5
num_neurons_per_layer : int, optional
Number of neurons per layer - used to compute the total number of layers.
This is only necessary in the case of t7/pt/acts activations.
is_brnn : bool, optional
If the model used to extract activations was bidirectional (default: False)
Returns
-------
activations : list of numpy.ndarray
List of *sentence representations*, where each *sentence representation*
is a numpy matrix of shape ``[num tokens in sentence x concatenated representation size]``
num_layers : int
Number of layers. This is usually ``representation_size/num_neurons_per_layer``,
divided by 2 again if the model was bidirectional.
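Examples
--------
A minimal usage sketch; ``encoder.hdf5`` below is a hypothetical path:

>>> activations, num_layers = load_activations("encoder.hdf5")

``activations[i]`` is then a ``[num tokens in sentence i x num_layers *
num_neurons_per_layer]`` matrix for sentence ``i``.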
"""
file_ext = activations_path.split(".")[-1]
activations = None
num_layers = None
# Load activations based on type
# Also ensure everything is on the CPU
# as activations may have been saved as CUDA variables
if file_ext == "t7":
# t7 loading requires torch < 1.0
print("Loading seq2seq-attn activations from %s..." % (activations_path))
assert (
num_neurons_per_layer is not None
), "t7 activations require num_neurons_per_layer"
from torch.utils.serialization import load_lua
activations = load_lua(activations_path)["encodings"]
activations = [a.cpu() for a in activations]
num_layers = len(activations[0][0]) / num_neurons_per_layer
if is_brnn:
num_layers /= 2
elif file_ext == "pt":
print("Loading OpenNMT-py activations from %s..." % (activations_path))
assert (
num_neurons_per_layer is not None
), "pt activations require num_neurons_per_layer"
activations = torch.load(activations_path)
activations = [
torch.stack([torch.cat(token) for token in sentence]).cpu()
for sentence in activations
]
num_layers = len(activations[0][0]) / num_neurons_per_layer
elif file_ext == "acts":
print("Loading generic activations from %s..." % (activations_path))
assert (
num_neurons_per_layer is not None
), "acts activations require num_neurons_per_layer"
with open(activations_path, "rb") as activations_file:
activations = pickle.load(activations_file)
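# Each pickled entry is assumed to be a (layer identifier, per-sentence
# activations) pair; the identifiers are only used for logging below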
# Combine all layers sequentially
print("Combining layers " + str([a[0] for a in activations]))
activations = [a[1] for a in activations]
num_layers = len(activations)
num_sentences = len(activations[0])
concatenated_activations = []
for sentence_idx in range(num_sentences):
sentence_acts = []
for layer_idx in range(num_layers):
sentence_acts.append(np.vstack(activations[layer_idx][sentence_idx]))
concatenated_activations.append(np.concatenate(sentence_acts, axis=1))
activations = concatenated_activations
elif file_ext == "hdf5":
print("Loading hdf5 activations from %s..." % (activations_path))
representations = h5py.File(activations_path, "r")
sentence_to_index = json.loads(representations.get("sentence_to_index")[0])
activations = []
# TODO: Check order
for _, value in sentence_to_index.items():
sentence_acts = torch.FloatTensor(representations[value])
num_layers, sentence_length, embedding_size = (
sentence_acts.shape[0],
sentence_acts.shape[1],
sentence_acts.shape[2],
)
num_neurons_per_layer = embedding_size
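# Each HDF5 entry is stored as [num_layers x sentence_length x embedding_size];
# move the token axis first and flatten the layers so the result matches the
# other formats: [sentence_length x (num_layers * embedding_size)]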
sentence_acts = np.swapaxes(sentence_acts, 0, 1)
sentence_acts = sentence_acts.reshape(
sentence_length, num_layers * embedding_size
)
activations.append(sentence_acts.numpy())
num_layers = len(activations[0][0]) / num_neurons_per_layer
elif file_ext == "json":
print("Loading json activations from %s..." % (activations_path))
activations = []
with open(activations_path) as fp:
for line in fp:
token_acts = []
sentence_activations = json.loads(line)["features"]
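# Each entry in "features" is one token with a per-layer "values" vector;
# concatenate the layers into a single row per token (e.g. the JSON emitted
# by BERT's extract_features script)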
for act in sentence_activations:
num_neurons_per_layer = len(act["layers"][0]["values"])
token_acts.append(
np.concatenate([l["values"] for l in act["layers"]])
)
activations.append(np.vstack(token_acts))
num_layers = activations[0].shape[1] / num_neurons_per_layer
print("Loaded %d sentences with %d layers" % (len(activations), int(num_layers)))
else:
assert False, "Activations must be of type t7, pt, acts, json or hdf5"
return activations, int(num_layers)
def filter_activations_by_layers(
train_activations, test_activations, filter_layers, rnn_size, num_layers, is_brnn
):
"""
Filter activations so that they only contain specific layers.
Useful for performing layer-wise analysis.
.. warning::
This function is deprecated and will be removed in future versions.
Parameters
----------
train_activations : list of numpy.ndarray
List of *sentence representations* from the train set, where each
*sentence representation* is a numpy matrix of shape
``[NUM_TOKENS x NUM_NEURONS]``. The method assumes that neurons from
all layers are present, with the number of neurons in every layer given
by ``rnn_size``
test_activations : list of numpy.ndarray
Similar to ``train_activations`` but with sentences from a test set.
filter_layers : str
A comma-separated string of the form "f1,f2,f10". "f" indicates a forward
layer while "b" indicates a backward layer in a bidirectional RNN. If the
activations are from a different kind of model, set ``is_brnn`` to ``False``
and provide only "f" entries. The number next to "f" is the 1-indexed layer
number, so "f1" corresponds to the embedding layer and so on.
rnn_size : int
Number of neurons in every layer.
num_layers : int
Total number of layers in the original model.
is_brnn : bool
Boolean indicating if the neuron activations are from a bidirectional model.
Returns
-------
filtered_train_activations : list of numpy.ndarray
Filtered train activations
filtered_test_activations : list of numpy.ndarray
Filtered test activations
Notes
-----
The method assumes that the neurons are laid out with all forward layers
first (forward layer 1, forward layer 2, ...), followed by all backward
layers for bidirectional models, with ``rnn_size`` neurons per layer.
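Examples
--------
Illustrative sketch with synthetic activations (two forward layers of three
neurons each), keeping only the second layer:

>>> import numpy as np
>>> train = [np.random.rand(5, 6)]  # one sentence: 5 tokens, 2 layers * 3 neurons
>>> test = [np.random.rand(4, 6)]
>>> f_train, f_test = filter_activations_by_layers(
...     train, test, "f2", rnn_size=3, num_layers=2, is_brnn=False
... )

``f_train[0]`` now has shape ``(5, 3)`` and contains only the second layer's neurons.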
"""
_layers = filter_layers.split(",")
layer_prefixes = ["f"]
if is_brnn:
layer_prefixes = ["f", "b"]
# Candidate layers that can be selected via ``filter_layers``
layers = list(range(1, num_layers + 1))
filtered_train_activations = None
filtered_test_activations = None
layers_idx = []
for brnn_idx, b in enumerate(layer_prefixes):
for l in layers:
if "%s%d" % (b, l) in _layers:
start_idx = brnn_idx * (num_layers * rnn_size) + (l - 1) * rnn_size
end_idx = brnn_idx * (num_layers * rnn_size) + (l) * rnn_size
print(
"Including neurons from %s%d (#%d to #%d)"
% (b, l, start_idx, end_idx)
)
layers_idx.append(np.arange(start_idx, end_idx))
layers_idx = np.concatenate(layers_idx)
filtered_train_activations = [a[:, layers_idx] for a in train_activations]
filtered_test_activations = [a[:, layers_idx] for a in test_activations]
return filtered_train_activations, filtered_test_activations
def load_aux_data(
source_path,
labels_path,
source_aux_path,
activations,
max_sent_l,
ignore_start_token=False,
):
"""Load word-annotated text-label pairs represented as sentences, where
activation extraction was performed on tokenized text. This function loads
the source text, tokenized source text, target labels, and activations, and
tries to make them perfectly parallel, i.e. the number of tokens in line N of
the source matches the number of tokens in line N of the target, and the
number of tokens in line N of ``source_aux`` matches the number of activations
at index N. The method deletes non-matching activation/source/source_aux/target
entries, up to a maximum of 100 before failing. It also ignores sentences
longer than the provided maximum. The activations are modified in place.
.. warning::
This function is deprecated and will be removed in future versions.
Parameters
----------
source_path : str
Path to the source text file, one sentence per line
labels_path : str
Path to the annotated labels file, one sentence per line corresponding to
the sentences in the ``source_path`` file.
source_aux_path : str
Path to the source text file with tokenization, one sentence per line
activations : list of numpy.ndarray
Activations returned from ``loader.load_activations``
max_sent_l : int
Maximum length of sentences. Sentences containing more tokens will be
ignored.
ignore_start_token : bool, optional
Ignore the first token. Useful if there are line position markers
in the source text.
Returns
-------
tokens : dict
Dictionary containing three lists, ``source``, ``source_aux`` and
``target``. ``source`` contains all of the sentences from ``source_path``
that were not ignored. ``source_aux`` contains all tokenized sentences
from ``source_aux_path``. ``target`` contains the parallel set of
annotated labels.
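Examples
--------
Illustrative sketch; the file paths below are hypothetical:

>>> activations, _ = load_activations("activations.json")
>>> tokens = load_aux_data("source.txt", "labels.txt", "source.tok.txt",
...                        activations, max_sent_l=250)

``tokens["source"]``, ``tokens["source_aux"]`` and ``tokens["target"]`` are
parallel lists of equal length after loading.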
"""
tokens = {"source_aux": [], "source": [], "target": []}
skipped_lines = set()
with open(source_aux_path) as source_aux_fp:
for line_idx, line in enumerate(source_aux_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
print("Skipping line #%d because of length (aux)" % (line_idx))
skipped_lines.add(line_idx)
if ignore_start_token:
line_tokens = line_tokens[1:]
activations[line_idx] = activations[line_idx][1:, :]
tokens["source_aux"].append(line_tokens)
with open(source_path) as source_fp:
for line_idx, line in enumerate(source_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
print("Skipping line #%d because of length (source)" % (line_idx))
skipped_lines.add(line_idx)
if ignore_start_token:
line_tokens = line_tokens[1:]
tokens["source"].append(line_tokens)
with open(labels_path) as labels_fp:
for line_idx, line in enumerate(labels_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
print("Skipping line #%d because of length (label)" % (line_idx))
skipped_lines.add(line_idx)
if ignore_start_token:
line_tokens = line_tokens[1:]
tokens["target"].append(line_tokens)
assert len(tokens["source_aux"]) == len(tokens["source"]) and len(
tokens["source_aux"]
) == len(tokens["target"]), (
"Number of lines do not match (source: %d, aux: %d, target: %d)!"
% (len(tokens["source"]), len(tokens["source_aux"]), len(tokens["target"]))
)
assert len(activations) == len(tokens["source"]), (
"Number of lines do not match (activations: %d, source: %d)!"
% (len(activations), len(tokens["source"]))
)
for num_deleted, line_idx in enumerate(sorted(skipped_lines)):
print("Deleting skipped line %d" % (line_idx))
# Offset by num_deleted since every earlier deletion shifts later indices,
# and drop the corresponding activation to keep everything parallel
del activations[line_idx - num_deleted]
del tokens["source_aux"][line_idx - num_deleted]
del tokens["source"][line_idx - num_deleted]
del tokens["target"][line_idx - num_deleted]
# Check if all data is well formed (whether we have activations + labels for each
# and every word)
invalid_activation_idx = []
for idx, activation in enumerate(activations):
if activation.shape[0] == len(tokens["source_aux"][idx]) and len(
tokens["source"][idx]
) == len(tokens["target"][idx]):
pass
else:
invalid_activation_idx.append(idx)
print(
"Skipping line: ",
idx,
"A: %d, aux: %d, src: %d, tgt: %d"
% (
activation.shape[0],
len(tokens["source_aux"][idx]),
len(tokens["source"][idx]),
len(tokens["target"][idx]),
),
)
assert len(invalid_activation_idx) < 100, (
"Too many mismatches (%d) - your paths are probably incorrect or something is wrong in the data!"
% (len(invalid_activation_idx))
)
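# Delete mismatched entries in ascending order; subtract num_deleted since
# every earlier deletion shifts the remaining indices left by one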
for num_deleted, idx in enumerate(invalid_activation_idx):
print(
"Deleting line %d: %d activations, %s aux, %d source, %d target"
% (
idx - num_deleted,
activations[idx - num_deleted].shape[0],
len(tokens["source_aux"][idx - num_deleted]),
len(tokens["source"][idx - num_deleted]),
len(tokens["target"][idx - num_deleted]),
)
)
del activations[idx - num_deleted]
del tokens["source_aux"][idx - num_deleted]
del tokens["source"][idx - num_deleted]
del tokens["target"][idx - num_deleted]
for idx, activation in enumerate(activations):
assert activation.shape[0] == len(tokens["source_aux"][idx])
assert len(tokens["source"][idx]) == len(tokens["target"][idx])
return tokens
def load_data(
source_path,
labels_path,
activations,
max_sent_l,
ignore_start_token=False,
sentence_classification=False,
):
"""Load word-annotated text-label pairs represented as sentences. This
function loads the source text, target labels, and activations, and tries to
make them perfectly parallel, i.e. the number of tokens in line N of the
source matches the number of tokens in line N of the target, and also matches
the number of activations at index N. The method deletes non-matching
activation/source/target entries, up to a maximum of 100 before failing. It
also ignores sentences longer than the provided maximum. The activations are
modified in place.
Parameters
----------
source_path : str
Path to the source text file, one sentence per line
labels_path : str
Path to the annotated labels file, one sentence per line corresponding to
the sentences in the ``source_path`` file.
activations : list of numpy.ndarray
Activations returned from ``loader.load_activations``
max_sent_l : int
Maximum length of sentences. Sentences containing more tokens will be
ignored.
ignore_start_token : bool, optional
Ignore the first token. Useful if there are line position markers
in the source text.
sentence_classification : bool, optional
Flag to indicate if this is a sentence classification task, where every
sentence has only a single activation vector (e.g. the [CLS] token's
activation in the case of BERT).
Returns
-------
tokens : dict
Dictionary containing two lists, ``source`` and ``target``. ``source``
contains all of the sentences from ``source_path`` that were not ignored.
``target`` contains the parallel set of annotated labels.
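Examples
--------
Illustrative sketch; the file paths below are hypothetical:

>>> activations, _ = load_activations("activations.hdf5")
>>> tokens = load_data("source.txt", "labels.txt", activations, max_sent_l=250)

``tokens["source"][i]``, ``tokens["target"][i]`` and ``activations[i]`` all
describe the same sentence after loading.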
"""
tokens = {"source": [], "target": []}
with open(source_path) as source_fp:
for line_idx, line in enumerate(source_fp):
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
continue
if ignore_start_token:
line_tokens = line_tokens[1:]
activations[line_idx] = activations[line_idx][1:, :]
tokens["source"].append(line_tokens)
with open(labels_path) as labels_fp:
for line in labels_fp:
line_tokens = line.strip().split()
if len(line_tokens) > max_sent_l:
continue
if ignore_start_token:
line_tokens = line_tokens[1:]
tokens["target"].append(line_tokens)
assert len(tokens["source"]) == len(tokens["target"]), (
"Number of lines do not match (source: %d, target: %d)!"
% (len(tokens["source"]), len(tokens["target"]))
)
assert len(activations) == len(tokens["source"]), (
"Number of lines do not match (activations: %d, source: %d)!"
% (len(activations), len(tokens["source"]))
)
# Check if all data is well formed (whether we have activations + labels for
# each and every word)
invalid_activation_idx = []
for idx, activation in enumerate(activations):
if activation.shape[0] == len(tokens["source"][idx]) and (
sentence_classification or activation.shape[0] == len(tokens["target"][idx])
):
pass
else:
invalid_activation_idx.append(idx)
print("Skipping line: ", idx)
print(
"A: %d, S: %d, T: %d"
% (
activation.shape[0],
len(tokens["source"][idx]),
len(tokens["target"][idx]),
)
)
assert len(invalid_activation_idx) < 100, (
"Too many mismatches (%d) - your paths are probably incorrect or something is wrong in the data!"
% (len(invalid_activation_idx))
)
for num_deleted, idx in enumerate(invalid_activation_idx):
print(
"Deleting line %d: %d activations, %d source, %d target"
% (
idx - num_deleted,
activations[idx - num_deleted].shape[0],
len(tokens["source"][idx - num_deleted]),
len(tokens["target"][idx - num_deleted]),
)
)
del activations[idx - num_deleted]
del tokens["source"][idx - num_deleted]
del tokens["target"][idx - num_deleted]
for idx, activation in enumerate(activations):
assert activation.shape[0] == len(tokens["source"][idx])
if not sentence_classification:
assert activation.shape[0] == len(tokens["target"][idx])
# TODO: Return activations
return tokens
def load_sentence_data(source_path, labels_path, activations):
"""Load sentence-annotated text-label pairs. This function loads the source
text, target labels, and activations, and checks that they are perfectly
parallel, i.e. that the number of source sentences matches the number of
label lines and the number of activations, and that every sentence
representation consists of a single activation row. Each source sentence is
represented by a placeholder token of the form ``sentence_N``.
Parameters
----------
source_path : str
Path to the source text file, one sentence per line
labels_path : str
Path to the annotated labels file, one sentence per line corresponding to
the sentences in the ``source_path`` file.
activations : list of numpy.ndarray
Activations returned from ``loader.load_activations``
Returns
-------
tokens : dict
Dictionary containing two lists, ``source`` and ``target``. ``source``
contains all of the sentences from ``source_path`` that were not ignored.
``target`` contains the parallel set of annotated labels.
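Examples
--------
Illustrative sketch; the file paths below are hypothetical, and every sentence
is expected to have a single activation row (e.g. a [CLS] representation):

>>> activations, _ = load_activations("cls_activations.hdf5")
>>> tokens = load_sentence_data("sentences.txt", "labels.txt", activations)

``tokens["source"][i]`` is a single placeholder token of the form ``sentence_i``.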
"""
tokens = {"source": [], "target": []}
with open(source_path) as source_fp:
for line_idx, line in enumerate(source_fp):
tokens["source"].append(["sentence_%d" % (line_idx)])
with open(labels_path) as labels_fp:
for line in labels_fp:
line_tokens = line.strip().split()
tokens["target"].append(line_tokens)
assert len(tokens["source"]) == len(tokens["target"]), (
"Number of lines do not match (source: %d, target: %d)!"
% (len(tokens["source"]), len(tokens["target"]))
)
assert len(activations) == len(tokens["source"]), (
"Number of lines do not match (activations: %d, source: %d)!"
% (len(activations), len(tokens["source"]))
)
# Check if all data is well formed (whether we have activations + labels for
# each and every word)
for idx, activation in enumerate(activations):
assert activation.shape[0] == len(tokens["source"][idx])
return tokens