Source code for textbrewer.data_utils

import numpy as np
import random

def masking(tokens, p = 0.1, mask='[MASK]'):
    """
    Builds a masked copy of `tokens`: every element is independently
    replaced by `mask` with probability `p`. The input is copied with
    ``tokens[:]`` before any replacement is made.

    Args:
        tokens (list): list of tokens or token ids.
        p (float): probability of masking each individual element.
        mask: value written in place of a masked element.

    Returns:
        A copy of `tokens` in which each element has been replaced by
        `mask` with probability `p`.
    """
    masked = tokens[:]
    for position, _ in enumerate(tokens):
        # One uniform draw per element, taken in order.
        if not (np.random.rand() < p):
            continue
        masked[position] = mask
    return masked
def deleting(tokens, p = 0.1):
    """
    Builds a new list from `tokens` in which every element has been
    independently dropped with probability `p`.

    Args:
        tokens (list): list of tokens or token ids.
        p (float): probability of deleting each individual element.

    Returns:
        A new list holding the surviving elements of `tokens`, in their
        original order.
    """
    # A single vectorised Bernoulli draw: 1 means "keep", 0 means "delete".
    keep_flags = np.random.binomial(1, 1 - p, len(tokens))
    survivors = []
    for token, keep in zip(tokens, keep_flags):
        if keep == 1:
            survivors.append(token)
    return survivors
def n_gram_sampling(tokens, p_ng = [0.2,0.2,0.2,0.2,0.2], l_ng = [1,2,3,4,5]):
    """
    Samples a length `l` from `l_ng` with probability distribution `p_ng`,
    then returns a random span of length `l` from `tokens`.

    If `tokens` is shorter than the sampled length, the whole of `tokens`
    is returned (previously this case raised ``ValueError`` inside
    ``np.random.randint``).

    Args:
        tokens (list): list of tokens or token ids.
        p_ng (list): probability distribution of the n-grams, should sum to 1.
        l_ng (list): specify the n-grams, i.e. the candidate span lengths.

    Returns:
        a n-gram random span from `tokens`; it has at most `l` elements.
    """
    # The list defaults are only read here, never mutated, so sharing them
    # across calls is harmless.
    span_length = np.random.choice(l_ng, p=p_ng)
    # Number of valid start offsets. Clamp to 1 so that a sequence shorter
    # than the span yields start 0 instead of an invalid randint range.
    num_starts = max(1, len(tokens) - span_length + 1)
    start_position = np.random.randint(0, num_starts)
    n_gram_span = tokens[start_position:start_position + span_length]
    return n_gram_span
def short_disorder(tokens, p = [0.9,0.1,0,0,0]):  # untouched + four disorder cases: abc, bac, cba, cab, bca
    """
    Returns a copy of `tokens` that has been locally shuffled. Scanning from
    left to right, one of five rearrangements is drawn at each position
    according to the distribution `p` and applied to the 3-gram `abc`
    starting there:

      | index 0: abc -> abc (untouched, advance by one position)
      | index 1: abc -> bac (advance by two positions)
      | index 2: abc -> cba (advance by three positions)
      | index 3: abc -> cab (advance by three positions)
      | index 4: abc -> bca (advance by three positions)

    When only two elements remain, any non-zero draw swaps that final pair.

    Args:
        tokens (list): list of tokens or token ids.
        p (list): probability distribution of 5 disorder types, should sum to 1.

    Returns:
        a new disordered list
    """
    shuffled = tokens[:]
    total = len(tokens)
    pos = 0
    while pos < total - 1:
        kind = np.random.choice([0, 1, 2, 3, 4], p=p)

        if kind == 0:
            # Leave this position alone and move on.
            pos += 1
            continue

        if kind == 1 or pos == total - 2:
            # Plain neighbour swap; also the only option for the last pair.
            shuffled[pos], shuffled[pos + 1] = shuffled[pos + 1], shuffled[pos]
            pos += 2
            continue

        first, second, third = shuffled[pos], shuffled[pos + 1], shuffled[pos + 2]
        if kind == 2:
            rearranged = (third, second, first)
        elif kind == 3:
            rearranged = (third, first, second)
        else:
            rearranged = (second, third, first)
        shuffled[pos], shuffled[pos + 1], shuffled[pos + 2] = rearranged
        pos += 3
    return shuffled
def long_disorder(tokens,p = 0.1, length=20):
    """
    Performs a long-range disordering by swapping the two halves of spans
    in `tokens`.

    If ``int(length) > 1``, `length` is the absolute span length; otherwise
    `length` is treated as a fraction of ``len(tokens)``. The resulting span
    length is truncated to an integer and then rounded up to the next even
    number so that it splits into two equal halves. If the resulting span
    length is not positive, nothing can be swapped and an unchanged copy of
    `tokens` is returned.

    For example::

        >>> long_disorder([0,1,2,3,4,5,6,7,8,9,10], p=1, length=0.4)
        [2, 3, 0, 1, 6, 7, 4, 5, 8, 9, 10]

    Args:
        tokens (list): list of tokens or token ids.
        p (float): probability to swap the two halves of a span at each
            possible position.
        length (int or float): length of the disordered span.

    Returns:
        a new disordered list
    """
    outputs = tokens[:]
    if int(length) <= 1:
        # Relative length: scale by the sequence length.
        length = len(tokens)*length
    # Truncate, then round up to an even number of elements.
    length = (int(length)+1) // 2 * 2
    if length <= 0:
        # A zero-length span would make the loop advance by 0 (an endless
        # loop when p >= 1); a negative one is meaningless.
        return outputs
    half = length // 2
    last_start = len(outputs) - length
    i = 0
    while i <= last_start:
        if np.random.rand() < p:
            outputs[i:i+half], outputs[i+half:i+length] = outputs[i+half:i+length], outputs[i:i+half]
            # Skip past the span just swapped so spans never overlap.
            i += length
        else:
            i += 1
    return outputs