Source code for neurox.data.writer

"""Representations Writers

Module with various writers for saving representations/activations. Currently,
two file types are supported:

1. ``hdf5``: This is a binary format, and results in smaller overall files.
   The structure of the file is as follows:

   * ``sentence_to_index`` dataset: Contains a single json string at index 0 that
     maps sentences to indices
   * Indices ``0`` through ``N-1`` datasets: Each index corresponds to one
     sentence. The value of the dataset is a tensor with dimensions
     ``num_layers x sentence_length x embedding_size``, where ``embedding_size``
     may include multiple layers
2. ``json``: This is a human-readable format. There is some loss of precision,
   since each activation value is saved using 8 decimal places. Concretely, this
   results in a jsonl file, where each line is a json string corresponding to a
   single sentence. The structure of each line is as follows:

   * ``linex_index``: Sentence index
   * ``features``: List of tokens (with their activations)

     * ``token``: The current token
     * ``layers``: List of layers

       * ``index``: Layer index (does not correspond to original model's layers)
       * ``values``: List of activation values for all neurons in the layer

The writers also support saving activations from specific layers only, using the
``filter_layers`` argument. Since activation files can be large, an additional
option for decomposing the representations into layer-wise files is also
provided.
"""

import argparse
import collections
import json

import h5py


class ActivationsWriter:
    """
    Class that encapsulates all available writers.

    This is the only class that should be used by the rest of the library.
    The ``open``, ``write_activations`` and ``close`` methods defined here
    only raise ``NotImplementedError``; use ``get_writer`` to obtain an
    object that can actually write.

    Attributes
    ----------
    filename : str
        Filename for storing the activations. May not be used exactly if
        ``decompose_layers`` is True.
    filetype : str
        An additional hint for the filetype. This argument is optional.
        The file type will be detected automatically from the filename if
        none is supplied.
    decompose_layers : bool
        Set to true if each layer's activations should be saved in a
        separate file.
    filter_layers : str
        Comma separated list of layer indices to save.
    """

    def __init__(
        self, filename, filetype=None, decompose_layers=False, filter_layers=None
    ):
        self.filename = filename
        # Keep the hint around: it is documented as an attribute of the class.
        self.filetype = filetype
        self.decompose_layers = decompose_layers
        self.filter_layers = filter_layers

    def open(self):
        """
        Method to open the underlying files. Will be called automatically
        by the class instance when necessary.

        Raises
        ------
        NotImplementedError
            Always, on this base class.
        """
        raise NotImplementedError("Use a specific writer or the `get_writer` method.")

    def write_activations(self, sentence_idx, extracted_words, activations):
        """
        Method to write a single sentence's activations to file.

        Raises
        ------
        NotImplementedError
            Always, on this base class.
        """
        raise NotImplementedError("Use a specific writer or the `get_writer` method.")

    def close(self):
        """
        Method to close the underlying files.

        Raises
        ------
        NotImplementedError
            Always, on this base class.
        """
        raise NotImplementedError("Use a specific writer or the `get_writer` method.")

    @staticmethod
    def get_writer(filename, filetype=None, decompose_layers=False, filter_layers=None):
        """
        Method to get the correct writer based on filename and filetype.

        Parameters
        ----------
        filename : str
            Filename for storing the activations.
        filetype : str, optional
            Hint for the file type.
        decompose_layers : bool, optional
            Whether each layer should be saved in a separate file.
        filter_layers : str, optional
            Comma separated list of layer indices to save.

        Returns
        -------
        ActivationsWriterManager
            A manager wrapping the writer selected for ``filename`` and
            ``filetype``.
        """
        return ActivationsWriterManager(
            filename, filetype, decompose_layers, filter_layers
        )

    @staticmethod
    def add_writer_options(parser):
        """
        Method to add argparse arguments specific to activation writers.

        The arguments ``--output_type``, ``--decompose_layers`` and
        ``--filter_layers`` are registered on ``parser`` in place; nothing is
        returned.

        Parameters
        ----------
        parser : argparse.ArgumentParser
            Parser (or any object exposing ``add_argument``) to extend.
        """
        parser.add_argument(
            "--output_type",
            choices=["autodetect", "hdf5", "json"],
            default="autodetect",
            help="Output format of the extracted representations. Default autodetects based on file extension.",
        )
        parser.add_argument(
            "--decompose_layers",
            action="store_true",
            help="Save activations from each layer in a separate file",
        )
        parser.add_argument(
            "--filter_layers",
            default=None,
            type=str,
            help="Comma separated list of layers to save activations for. The layers will be saved in the order specified in this argument.",
        )


class ActivationsWriterManager(ActivationsWriter):
    """
    Manager class that handles decomposition and filtering.

    Decomposition requires multiple writers (one per file) and filtering
    requires processing the activations to remove unneeded layer activations.
    This class sits on top of the actual activations writer to manage these
    operations.

    Attributes
    ----------
    base_writer : type
        Writer class instantiated for every output file
        (``HDF5ActivationsWriter`` or ``JSONActivationsWriter``).
    layers : list of int or None
        Layer indices that will be written; ``None`` until ``open`` is called.
    writers : list or None
        Opened writer instances; ``None`` until ``open`` is called.
    """

    def __init__(
        self, filename, filetype=None, decompose_layers=False, filter_layers=None
    ):
        """
        Select the concrete writer class from ``filename`` / ``filetype``.

        Raises
        ------
        NotImplementedError
            If neither the filename suffix nor ``filetype`` identifies a
            supported format.
        """
        super().__init__(
            filename,
            filetype=filetype,
            decompose_layers=decompose_layers,
            filter_layers=filter_layers,
        )

        if filename.endswith(".hdf5") or filetype == "hdf5":
            self.base_writer = HDF5ActivationsWriter
        elif filename.endswith(".json") or filetype == "json":
            self.base_writer = JSONActivationsWriter
        else:
            raise NotImplementedError("filetype not supported. Use `hdf5` or `json`.")

        # Both stay None until the first sentence arrives, because the number
        # of layers is only known from the activations themselves.
        self.layers = None
        self.writers = None

    def open(self, num_layers):
        """
        Create and open the underlying writer(s).

        Parameters
        ----------
        num_layers : int
            Number of layers present in the activations. Used to build the
            list of layers when ``filter_layers`` is not set.
        """
        if self.filter_layers:
            self.layers = [int(layer) for layer in self.filter_layers.split(",")]
        else:
            self.layers = list(range(num_layers))

        if self.decompose_layers:
            # Split on the last dot instead of relying on fixed slice offsets,
            # e.g. "acts.hdf5" -> "acts-layer3.hdf5".
            stem, _, extension = self.filename.rpartition(".")
            filenames = [
                f"{stem}-layer{layer_idx}.{extension}" for layer_idx in self.layers
            ]
        else:
            filenames = [self.filename]

        self.writers = []
        for local_filename in filenames:
            _writer = self.base_writer(local_filename)
            _writer.open()
            self.writers.append(_writer)

    def write_activations(self, sentence_idx, extracted_words, activations):
        """
        Write one sentence's activations, applying filtering/decomposition.

        Parameters
        ----------
        sentence_idx : int
            Index of the sentence being written.
        extracted_words : list of str
            Tokens of the sentence.
        activations : array-like
            Activations indexed as ``[layer, token, neuron]``; its first
            dimension gives the number of layers. Must support indexing with
            a list of layer indices.
        """
        if self.writers is None:
            self.open(activations.shape[0])

        if self.decompose_layers:
            # One writer per selected layer; the list index keeps the layer
            # dimension so every file still holds a 3-D array.
            for writer, layer_idx in zip(self.writers, self.layers):
                writer.write_activations(
                    sentence_idx, extracted_words, activations[[layer_idx], :, :]
                )
        else:
            self.writers[0].write_activations(
                sentence_idx, extracted_words, activations[self.layers, :, :]
            )

    def close(self):
        """
        Close every underlying writer.

        Does nothing if no writer was ever opened (i.e. no activations were
        written).
        """
        if self.writers is None:
            return
        for writer in self.writers:
            writer.close()


class HDF5ActivationsWriter(ActivationsWriter):
    """
    Writer that stores activations in an ``hdf5`` file.

    Each sentence is stored in a dataset named after its index, and a
    ``sentence_to_index`` dataset holding a single json string (mapping
    sentences to indices) is added when the file is closed.
    """

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : str
            Output path; must end with ``.hdf5``.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.hdf5``.
        """
        super().__init__(filename, filetype="hdf5")
        if not self.filename.endswith(".hdf5"):
            raise ValueError(
                f"Output filename ({self.filename}) does not end with .hdf5, but output file type is hdf5."
            )
        self.activations_file = None
        # Defined up front so the attribute exists before `open` is called.
        self.sentence_to_index = {}

    def open(self):
        """Open the output file for writing and reset the sentence index."""
        self.activations_file = h5py.File(self.filename, "w")
        self.sentence_to_index = {}

    def write_activations(self, sentence_idx, extracted_words, activations):
        """
        Store one sentence's activations as a ``float32`` dataset.

        Parameters
        ----------
        sentence_idx : int
            Index of the sentence; its string form is the dataset name.
        extracted_words : list of str
            Tokens of the sentence, joined with spaces to form the key in
            the sentence-to-index mapping.
        activations : array-like
            Activations for the sentence; must expose ``shape``.
        """
        if self.activations_file is None:
            self.open()

        self.activations_file.create_dataset(
            str(sentence_idx), activations.shape, dtype="float32", data=activations
        )

        # TODO: Replace with better implementation with list of indices
        # Repeated sentences get an " (Occurrence k)" suffix, k starting at 2,
        # so that every sentence keeps a distinct key in the mapping.
        sentence = " ".join(extracted_words)
        final_sentence = sentence
        counter = 1
        while final_sentence in self.sentence_to_index:
            counter += 1
            final_sentence = f"{sentence} (Occurrence {counter})"
        self.sentence_to_index[final_sentence] = str(sentence_idx)

    def close(self):
        """
        Write the sentence-to-index mapping and close the file.

        Does nothing if the file was never opened. The file handle is closed
        even if writing the mapping fails.
        """
        if self.activations_file is None:
            return
        try:
            sentence_index_dataset = self.activations_file.create_dataset(
                "sentence_to_index", (1,), dtype=h5py.special_dtype(vlen=str)
            )
            sentence_index_dataset[0] = json.dumps(self.sentence_to_index)
        finally:
            self.activations_file.close()


class JSONActivationsWriter(ActivationsWriter):
    """
    Writer that stores activations as json lines.

    Every call to ``write_activations`` appends one line containing a json
    object with the keys ``linex_index`` and ``features``.
    """

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : str
            Output path; must end with ``.json``.

        Raises
        ------
        ValueError
            If ``filename`` does not end with ``.json``.
        """
        super().__init__(filename, filetype="json")
        if not self.filename.endswith(".json"):
            raise ValueError(
                f"Output filename ({self.filename}) does not end with .json, but output file type is json."
            )

        self.activations_file = None

    def open(self):
        """Open the output file for writing as UTF-8 text."""
        self.activations_file = open(self.filename, "w", encoding="utf-8")

    def write_activations(self, sentence_idx, extracted_words, activations):
        """
        Append one sentence's activations as a single json line.

        Parameters
        ----------
        sentence_idx : int
            Index of the sentence, stored under ``linex_index``.
        extracted_words : list of str
            Tokens of the sentence; one ``features`` entry is written per
            token.
        activations : array-like
            Activations indexed as ``[layer, token, neuron]`` whose elements
            expose ``.item()``. Values are rounded to 8 decimal places.
        """
        if self.activations_file is None:
            self.open()

        num_layers = activations.shape[0]
        # Plain dicts preserve insertion order, so the key order of the
        # emitted json is fixed by the order of the literals below.
        all_out_features = [
            {
                "token": extracted_word,
                "layers": [
                    {
                        "index": layer_idx,
                        "values": [
                            round(x.item(), 8)
                            for x in activations[layer_idx, word_idx, :]
                        ],
                    }
                    for layer_idx in range(num_layers)
                ],
            }
            for word_idx, extracted_word in enumerate(extracted_words)
        ]
        output_json = {"linex_index": sentence_idx, "features": all_out_features}
        self.activations_file.write(json.dumps(output_json) + "\n")

    def close(self):
        """Close the output file; does nothing if it was never opened."""
        if self.activations_file is None:
            return
        self.activations_file.close()
NeuroX toolkit documentation

Source code for neurox.data.writer