import matplotlib.pyplot as plt import transformers.generation_tf_utils as ge import numpy as np import tensorflow as tf def generate_modif( self, input_ids=None, max_length=None, min_length=None, do_sample=None, early_stopping=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bad_words_ids=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, length_penalty=None, no_repeat_ngram_size=None, num_return_sequences=None, attention_mask=None, decoder_start_token_id=None, use_cache=None, forced_bos_token_id=None, forced_eos_token_id=None, tokenizer=None, VERBOSE=None, probaMode=None, force2nd=None, ): # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) forced_bos_token_id = ( forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id ) forced_eos_token_id = ( forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id ) if input_ids is not None: batch_size = ge.shape_list(input_ids)[0] # overridden by the input batch_size else: batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." assert temperature > 0, "`temperature` should be strictly positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictly positive." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictly positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) input_ids = tf.fill((batch_size, 1), bos_token_id) else: assert len(ge.shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) elif attention_mask is None: attention_mask = tf.ones_like(input_ids) if pad_token_id is None and eos_token_id is not None: if False: ge.logger.warning( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # current position and vocab size cur_len = ge.shape_list(input_ids)[1] # unused vocab_size = self.config.vocab_size # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: decoder_start_token_id = bos_token_id assert ( decoder_start_token_id is not None ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs = encoder(input_ids, attention_mask=attention_mask) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: input_ids_len = ge.shape_list(input_ids)[-1] input_ids = tf.broadcast_to( tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) attention_mask = tf.broadcast_to( tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) input_ids = tf.reshape( input_ids, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) attention_mask = tf.reshape( attention_mask, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: # create empty decoder_input_ids input_ids = ( tf.ones( (effective_batch_size * num_beams, 1), dtype=tf.int32, ) * decoder_start_token_id ) cur_len = 1 assert ( batch_size == encoder_outputs[0].shape[0] ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = tf.reshape( tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), shape=(-1,), ) # expand encoder_outputs encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) else: encoder_outputs = None cur_len = ge.shape_list(input_ids)[-1] assert ( cur_len < max_length ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" output = self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, vocab_size=vocab_size, encoder_outputs=encoder_outputs, attention_mask=attention_mask, use_cache=use_cache, tokenizer=tokenizer, VERBOSE=VERBOSE, probaMode=probaMode, num_beams=num_beams, force2nd=force2nd, ) return output def do_hist(x, title=""): if False: # get number of values below 1000 (assume this is -inf) res = tf.math.count_nonzero(tf.greater_equal(-1000, x)) x = x[tf.math.is_finite(x)] # x = tf.where(x > - 500, x, -500) num_bins = 20 # the histogram of the data n, bins, patches = plt.hist(x, num_bins, facecolor='blue') plt.xlabel('assigned proba') plt.ylabel('occurence') plt.title(r'{}: logistics distribution, # neglible tokens = {}'.format(title, res)) # Tweak spacing to prevent clipping of ylabel plt.subplots_adjust(left=0.15) plt.show() def getNewProba(probaMode, oldValue, nextValue, nb_gen=None): if probaMode == "longOk": newValue = float(np.power( oldValue, (nb_gen - 1.0) / nb_gen) * np.power(nextValue, 1.0 / nb_gen) ) return newValue elif probaMode == "mult": return oldValue*nextValue else: NotImplementedError("This probability mode is not yet implemented") print("def generate no beam search modif") def _generate_no_beam_search_modif( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, pad_token_id, eos_token_id, batch_size, vocab_size, encoder_outputs, attention_mask, use_cache, tokenizer, VERBOSE, probaMode, num_beams, force2nd, **kwargs ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ answers = [] scores = [] forbiden_first_token = [] context = tokenizer.decode(input_ids[0]) if VERBOSE == True: print("context is : {}".format(context)) for beam_nb in range(num_beams): answer = "" FIRST = True if force2nd: EOS_1st = True else: EOS_1st = False nb_gen = 0 output_score = 1 # length of generated sentences / unfinished sentences unfinished_sents = tf.ones_like(input_ids[:, 0]) sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models while cur_len < max_length: nb_gen += 1 model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs ) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] do_hist(next_token_logits[0], "init") # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # keep it to test stability, but to get best performance don't use it # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: next_token_logits_penalties = ge._create_next_token_logits_penalties( input_ids, next_token_logits, repetition_penalty ) next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) logistics_copy = tf.identity(next_token_logits) nb_fails = 0 while True: # restore logistics, if loop is done several times next_token_logits = tf.identity(logistics_copy) if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature do_hist(next_token_logits[0], "temp") # Top-p/top-k filtering print("topk = {}, top_p = {}".format(top_k, top_p)) next_token_logits = ge.tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) do_hist(next_token_logits[0], "top p/k") # Sample next_token = tf.squeeze( tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 ) else: # Greedy decoding next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) # is next token eos or in the sequence? if VERBOSE == True: print("The next token is {}".format(tokenizer.decode([int(next_token)]))) tmp_answer = answer + tokenizer.decode([int(next_token)]) forceReuse = True if not forceReuse: print("reuse not enforced") break remove_token = False # if end is detected if int(next_token) == eos_token_id: if EOS_1st: EOS_1st = False probs = tf.nn.softmax(next_token_logits[0]) next_score = probs[int(next_token)] if answer not in answers: answers.append(answer) if answers[-1] not in ["", "-"]: scores.append(getNewProba(probaMode, output_score, next_score, nb_gen=nb_gen)) else: scores.append(0.0) if VERBOSE == True: print("One result would be : {}, with proba {}".format(answers[-1], scores[-1])) remove_token = True else: if answer not in answers: answers.append(answer) if answers[-1] not in ["", "-"]: scores.append(float(output_score)) else: scores.append(0.0) if VERBOSE == True: print("Another result would be : {}, with proba {}".format(answers[-1], scores[-1])) answer = tmp_answer break # if probabillity is low (ignoring current token), or only forbiden tokens are proposed if output_score < 0.05 or nb_fails > 100: if answer not in answers: answers.append(answer) if answers[-1] not in ["", "-"]: scores.append(float(output_score)) else: scores.append(0.0) nb_fails = True if True: # VERBOSE == True: print( "No point to continue searching in ... : {}, with proba {}".format(answers[-1], scores[-1])) break # when predicting the first token, don't start multiple time at the same place if FIRST: FIRST = False if tmp_answer[0] in forbiden_first_token: if "nothing" not in VERBOSE: print("it started with the same letter") remove_token = True else: forbiden_first_token.append(tmp_answer[0]) # check that token is in input, and previous conditions are fullfulled (they would seet remove_token to True) if tmp_answer in context and not remove_token: answer = tmp_answer break # remove that token and search something else else: # remove that token as possibility nb_fails += 1 if "nothing" not in VERBOSE: print("this way the answer would not be in the context") print("remove token {}".format(int(next_token))) tmp = [[False for i in range(logistics_copy.shape[1])]] tmp[0][int(next_token)] = True logistics_copy = ge.set_tensor_by_indices_to_value( logistics_copy, tf.convert_to_tensor(tmp, dtype=tf.bool), -float("inf") ) probs = tf.nn.softmax(next_token_logits[0]) next_score = probs[int(next_token)] output_score = getNewProba(probaMode, output_score, next_score, nb_gen=nb_gen) if VERBOSE == True: print("output_score = {}, next_score = {}".format(output_score, next_score)) # print("The banned tokens are \n {}".format( # [tokenizer.decode([x]) for x in banned_tokens[0]])) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( unfinished_sents, tf.cast(eos_in_sents, tf.int32) ) sent_lengths = ( sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + cur_len * is_sents_unfinished_and_token_to_add_is_eos ) # unfinished_sents is set to zero if eos in sentence unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos # stop when there is a in each sentence, or if we exceed the maximum length if tf.math.reduce_max(unfinished_sents) == 0 or nb_fails == True: break # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((ge.shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 ) # if there are different sentences lengths in the batch, some batches have to be padded min_sent_length = tf.math.reduce_min(sent_lengths) max_sent_length = tf.math.reduce_max(sent_lengths) if min_sent_length != max_sent_length: assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" # finished sents are filled with pad_token padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id # create length masks for tf.where operation broad_casted_sent_lengths = tf.broadcast_to( tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] ) broad_casted_range = tf.transpose( tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size]) ) decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) else: decoded = input_ids if "score" in VERBOSE: print(answers) print(scores) if "" in answers: print(" -- ") if '' in answers: print(" ---- ") # find most likely option if len(scores) > 0: max_ind = scores.index(max(scores)) return [answers[max_ind], scores[max_ind]] else: return ["None", 0] return decoded print("def calc banned ...") def calc_banned_ngram_tokens_modif(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].numpy().tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - no_repeat_ngram_size ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) print("ngram_idx: {}".format(ngram_idx)) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] return banned_tokens