Source code for neurox.data.annotate

"""Given a list of sentences, their activations and a pattern, create a binary labeled dataset based on the pattern
where pattern can be a regular expression, a list of words and a function. 
For example, one can create a binary dataset of years vs. not-years (2004 vs. this) by specifying the regular expression
that matches the pattern of year. 
The program will extract positive class examples based on the provided filter and will consider rest of the examples as
negative class examples. The output of the program is a word file, a label file and an activation file.

"""
import collections
import numpy as np
import argparse
import sys
from typing import Pattern

import neurox.data.loader as data_loader
import neurox.data.utils as data_utils

def _create_binary_data(tokens, activations, binary_filter, balance_data=False):
    """
    Given a list of tokens, their activations and a binary_filter, create the binary labeled data. A binary filter can be
    a set, regex or a function. The regex option expects output of re.compile.
    
    Parameters
    ----------
    tokens : dictionary
        A dictonary of sentences with their dummy labels. The format is the output of the data_loader.load_data function
    activations: list
        A list of sentence-wise activations 
    binary_filter: a set of words or a regex object or a function
    
    Returns
    -------
    annotated_dataset : tuple of (tokens, labels and activations)
        A list of selected positive and negative class words, their labels (the
        string "positive" for examples matching the filter and "negative" for
        others) and their activations
    
    Example
    -------
    _create_binary_data(tokens, activations, re.compile(r'^\w\w$')) select words of two characters only as a positive class
    _create_binary_data(tokens, activations, {'is', 'can'}) select occrrences of 'is' and 'can' as a positive class
    """
    
    filter_fn = None
    if isinstance(binary_filter, set):
        filter_fn = lambda x: x in binary_filter
    elif isinstance(binary_filter, Pattern):
        filter_fn = lambda x: binary_filter.match(x)
    elif isinstance(binary_filter, collections.Callable):
        filter_fn = binary_filter
    else:
        raise NotImplementedError("ERROR: does not belong to any configuration")
        
    positive_class_words = []
    positive_class_activations = []
    negative_class_words = []    
    negative_class_activations = []
    
    print ("Creating binary dataset ...")
    for s_idx, sentences in enumerate(tokens['source']):
        for w_idx, word in enumerate(sentences):
            if filter_fn(word):
                positive_class_words.append(word)
                positive_class_activations.append(activations[s_idx][w_idx])
            else:
                negative_class_words.append(word)
                negative_class_activations.append(activations[s_idx][w_idx])

    if len(negative_class_words) == 0 or len(positive_class_words) == 0:
        raise ValueError(
            "Positive or Negative class examples are zero"
        )
    elif len(negative_class_words) < len(positive_class_words):
        print ("WARNING: the negative class examples are less than the positive class examples")
        print ("Postive class examples: ", len(positive_class_words), "Negative class examples: ", len(negative_class_words))

    if balance_data:
        negative_class_words, negative_class_activations = data_utils._balance_negative_class(negative_class_words, negative_class_activations, len(positive_class_words))
    
    print ("Number of Positive examples: ", len(positive_class_words))
    
    labels = (['positive']*len(positive_class_words)) + ['negative']*len(negative_class_words)
    
    return positive_class_words+negative_class_words, labels, positive_class_activations+negative_class_activations



[docs]def annotate_data(source_path, activations_path, binary_filter, output_prefix, output_type="hdf5", decompose_layers=False, filter_layers=None): """ Given a set of sentences, per word activations, a binary_filter and output_prefix, creates binary data and save it to the disk. A binary filter can be a set of words, a regex object or a function Parameters ---------- source_path : text file with one sentence per line activations: list A list of sentence-wise activations binary_filter: a set of words or a regex object or a function output_prefix: prefix of the output files that will be saved as the output of this script Returns ------- Saves a word file, a binary label file and their activations Example ------- annotate_data(source_path, activations_path, re.compile(r'^\w\w$')) select words of two characters only as a positive class annotate_data(source_path, activations_path, {'is', 'can'}) select occrrences of 'is' and 'can' as a positive class """ activations, num_layers = data_loader.load_activations(activations_path) # giving source_path instead of labels since labels will be generated later tokens = data_loader.load_data(source_path, source_path, activations, max_sent_l=512) words, labels, activations = _create_binary_data(tokens, activations, binary_filter) activations = [np.swapaxes(a.reshape((a.shape[1], num_layers, -1)), 0, 1) for a in activations] data_utils.save_files(words, labels, activations, output_prefix, output_type, decompose_layers, filter_layers)