Commit 98e0ca30 authored by DeepLearning VM

Updated

Showing with 741 additions and 0 deletions
import sys
sys.path.append('/data/chuancen/pip_package')
sys.path.append('..')
import nltk
from nltk.translate.meteor_score import meteor_score
from utils import get_values_lexicon, values_lexicon_encode
from pytorch_transformers import GPT2Tokenizer


def main():
    nltk.data.path.append('/data/chuancen/pip_package/nltk_data')
    print(nltk.__version__)
    file_handler = open('../../result/reference_SR_only.txt', 'r')
    ref = file_handler.readlines()
    file_handler = open('../../result/SR_only.txt', 'r')
    hyp = file_handler.readlines()
    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    meteor_sum = 0
    for i in range(min(len(ref), len(hyp))):
        meteor_sum += meteor_score([ref[i]], hyp[i])
    meteor_sum /= min(len(ref), len(hyp))
    print(meteor_sum)
    tokenizer = GPT2Tokenizer.from_pretrained('/data/chuancen/LIT/models/345M_Alex')


if __name__ == "__main__":
    main()
from .load_data import GptDataset, collate_fn, collate_fn_nli, GptDataset_nli, SnliDataset, GptDataset_aug, GptDataset_keyword, collate_fn_keyword, get_data, prepare_mix_review, update_mix_review, XLDataset_nli
#!/usr/bin/env python3
import sys
# sys.path.insert(0, '/home/shensq/LIT/pip_package')  # make sure the modified version of pytorch_transformers is used
import transformers
# assert pytorch_transformers.__file__[-36:] == 'pip_package/transformers/__init__.py'
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import argparse
import logging
import pickle
import re
import random
import torch
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from tqdm import tqdm, trange
from rouge import Rouge
from utils import clean_text, text_standardize, values_lexicon_encode
from gpt_loader import GptDataset, collate_fn, GptDataset_aug, GptDataset_keyword, collate_fn_keyword, get_data
# import nltk
# from nltk.translate.meteor_score import meteor_score
def top_k_logits(logits, k):
    """
    Masks everything but the k top entries as -infinity (-1e10).
    Used to mask logits such that e^-infinity -> 0 won't contribute to the
    sum of the denominator.
    """
    if k == 0:
        return logits
    else:
        values = torch.topk(logits, k)[0]
        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
        return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)
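# Illustrative check (a sketch, not part of the original file): with k=2, only the
# two largest logits in each row survive; the rest are pushed to -1e10 so they
# vanish after softmax.
# >>> top_k_logits(torch.tensor([[1.0, 3.0, 2.0, 0.5]]), k=2)
# tensor([[-1.0000e+10,  3.0000e+00,  2.0000e+00, -1.0000e+10]])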
def get_topic_keywords(meta):
    # TODO: temporary function
    keywords_up = []
    keywords_down = []
    if meta[1] == 'Weight management':
        keywords_up += [6551, 4483, 2057, 9799, 4425, 4461, 4255, 5517]
        keywords_down += [46040, 21856, 2526, 13230, 7523, 15220]
    if meta[1] == 'Smoking cessation':
        keywords_up += [46040, 21856, 2526, 13230, 7523, 15220]
        keywords_down += [6551, 4483, 2057, 9799, 4425, 4461, 4255, 5517]
    return keywords_up, keywords_down
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits
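# Illustrative check (a sketch, not part of the original file): with one dominant
# logit, nucleus filtering at top_p=0.9 keeps only that token, since its
# probability alone already exceeds the threshold. Note the function modifies its
# input in place, hence the clone().
# >>> logits = torch.tensor([0.1, 0.2, 0.3, 5.0])
# >>> top_k_top_p_filtering(logits.clone(), top_p=0.9)
# tensor([-inf, -inf, -inf, 5.])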
def sample_sequence(model, length, context, start_token=None, batch_size=1, modified_decoding=False,
                    value_word_relation=None, meta=None, key_word=None, num_samples=1, temperature=1,
                    top_k=0, top_p=0.0, device='cuda', use_keyword=None):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    prev = context
    past = None
    with torch.no_grad():
        for i in trange(length):
            # inputs = {'input_ids': generated, 'past': None, 'key_word': key_word, 'use_keyword': use_keyword}
            inputs = {'input_ids': generated, 'past': None}
            logits, past = model(**inputs)
            next_token_logits = logits[0, -1, :] / (temperature if temperature > 0 else 1.)
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            # if top_k > 0 or top_p > 0.0:  # greedy, top_p, top_k
            if temperature == 0:  # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:  # temperature sampling
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            while (i == 0) and (next_token[0] == 50256):  # resample if the first token is <|endoftext|> (id 50256)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
            prev = next_token.unsqueeze(0)
            if next_token[0] in [50256]:  # stop at <|endoftext|>
                break
    return generated
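# A minimal usage sketch (hypothetical prompt; assumes the model/tokenizer pair
# returned by load_model_data just below): decode 64 tokens with nucleus sampling.
# context = tokenizer.encode("How have you been feeling about your progress?")
# out = sample_sequence(model, length=64, context=context, temperature=1.0,
#                       top_p=0.95, device='cuda' if torch.cuda.is_available() else 'cpu')
# print(tokenizer.decode(out[0].tolist()[len(context):]))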
def load_model_data(args):
    # === prepare data and model ===
    # ====== Load GPT2 model ========
    model_dir = '../models/' + args.model_dir
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    if USE_CUDA:
        model.cuda()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    return model, tokenizer
def run_model(args, model, tokenizer, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
    hyp = []
    ref = []
    context = []
    f = open('../result/' + args.output_dir + '.txt', 'w')
    f_ref = open('../result/reference_' + args.output_dir + '.txt', 'w')
    for i, sample in enumerate(test_loader):
        if args.cross_attention:
            x, type_x, pos_x, lm_x, x_len, meta, keyword_x = sample
        else:
            x, type_x, pos_x, lm_x, x_len, meta = sample
            keyword_x = None
        input_len = x_len[0]  # The number of tokens of the context utterances
        context_tokens = x[0][:input_len + 1]  # at evaluation stage, the input is without the ground truth
        generated = 0
        for i in range(args.nsamples // args.batch_size):
            decode_length = int(len(context_tokens))
            # if args.augment:
            #     decode_length = int(0.5 * (5/6) * len(context_tokens))
            out = sample_sequence(
                model=model, length=decode_length,
                context=context_tokens,
                start_token=None,
                batch_size=args.batch_size,
                temperature=args.temperature, top_k=args.top_k, top_p=args.top_p,
                modified_decoding=args.modified_decoding,
                value_word_relation=None, device=device, meta=meta[0][0], key_word=keyword_x,
                use_keyword=args.cross_attention
            )
            out = out[:, len(context_tokens):-1].tolist()  # the generated result, with eos removed
            ref.append(tokenizer.decode(x[0].tolist()[len(context_tokens):-1]))
            f_ref.write(tokenizer.decode(x[0].tolist()[len(context_tokens):-1]))
            f_ref.write('\n')
            hyp.append(tokenizer.decode(out[0]))
            f.write(tokenizer.decode(out[0]))
            f.write('\n')
            context.append(tokenizer.decode(x[0].tolist()[:len(context_tokens)]))
    f.close()
    f_ref.close()
    return hyp, ref, context
def calculate_metric(hyp, ref, context, effective_length=1024):
    # NOTE: relies on the module-level `args` parsed under __main__.
    # ===== Calculate rouge ========
    with open('../result/rouge.txt', 'a') as f_result:
        rouge = Rouge()
        print(len(hyp))
        print(len(ref))
        hyp, ref = zip(*[(x, y) for x, y in zip(hyp, ref) if len(x) > 3 and len(y) > 3])
        print(len(hyp))
        hyp = [x[:effective_length] for x in hyp]
        ref = [x[:effective_length] for x in ref]
        scores = rouge.get_scores(hyp, ref, avg=True)
        print("ROUGE", scores)
        import time
        f_result.write(time.asctime() + '\n')
        f_result.write(args.model_dir + '\t' + str(effective_length) + '\n')
        f_result.write(str(scores))
        f_result.write('\n')
    # == dump output ====
    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    with open("../data_processed/output_" + args.model_dir + 'p{}k{}'.format(args.top_p, args.top_k), 'wb') as f_output:
        pickle.dump(zip(hyp, ref, context), f_output)
    # # ====== Calculate Meteor =========
    # meteor_sum = 0
    # for i in range(min(len(ref), len(hyp))):
    #     meteor_sum += meteor_score([ref[i]], hyp[i])
    # meteor_sum /= min(len(ref), len(hyp))
    # print(meteor_sum)
def rouge_rank(hyp, ref, context):
    rouge = Rouge()
    # import pdb; pdb.set_trace()
    hyp, ref = zip(*[(x, y) for x, y in zip(hyp, ref) if len(x) > 3 and len(y) > 3])
    scores = rouge.get_scores(hyp, ref, avg=False)  # type: list
    scores_content = zip(scores, hyp, ref, context, range(len(hyp)))
    scores_content = sorted(scores_content, key=lambda x: x[0]['rouge-1']['f'], reverse=True)
    return scores_content
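# Usage sketch (illustrative): rouge_rank returns (scores, hyp, ref, context, index)
# tuples sorted by ROUGE-1 F1, so the best and worst generations are easy to inspect.
# ranked = rouge_rank(hyp, ref, context)
# best_scores, best_hyp, best_ref, best_context, best_idx = ranked[0]
# print(best_scores['rouge-1']['f'], best_hyp)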
if __name__ == '__main__':
    USE_CUDA = torch.cuda.is_available()
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str, default='345M_Alex', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=64)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0)
    parser.add_argument('--output_dir', type=str, default='generate', help="The name of the output file.")
    parser.add_argument('--modified_decoding', action='store_true')
    parser.add_argument('--augment', action='store_true')
    parser.add_argument('--special_input', type=str)
    parser.add_argument('--keyword', action='store_true')
    parser.add_argument('--cross_attention', action='store_true')
    parser.add_argument('--num_turns', type=int, default=5)
    args = parser.parse_args()
    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0
    print(args)
    # Setup the random seeds.
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    model, tokenizer = load_model_data(args)
    split_size = {'train': 0.85, 'test': 0.1, 'val': 0.05}
    data_loader, test_loader, val_loader = get_data(args, split_size=split_size, tokenizer=tokenizer)
    # model, tokenizer, test_loader = load_model_data(args)  # TODO: this is for the old get_data
    hyp, ref, context = run_model(args, model, tokenizer, test_loader)
    sample_ranked = rouge_rank(hyp, ref, context)
    with open("../data_processed/rouge_rank_" + args.model_dir, 'wb') as f:
        pickle.dump(sample_ranked, f)
    calculate_metric(hyp, ref, context)
    # calculate_metric(hyp, ref, context, 5)

# Path to the pytorch checkpoint:
# /Users/shensq/Documents/LIT_ai_counseling/gpt2/models/pytorch_345M
import sys
# sys.path.insert(0, '/home/shensq/LIT/pip_package')
import re
import argparse
import torch
import pickle
import os
import transformers
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, AdamW, WEIGHTS_NAME, CONFIG_NAME
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from tqdm import tqdm, trange
import random
from utils import clean_text, text_standardize, construct_grouped_parameters, get_unfreezing_funcs
from gpt_loader import GptDataset, collate_fn, GptDataset_aug, GptDataset_keyword, collate_fn_keyword, prepare_mix_review, update_mix_review, get_data
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
def evaluate(model, data_loader, use_keyword=None):
    """
    Evaluate the model on the validation set.
    :param model: The model being trained.
    :param data_loader: The data loader for the validation set.
    :param use_keyword: Whether the input contains keywords or not.
    :return eval_loss: The average loss on the validation set.
    """
    model.eval()
    eval_loss = 0
    for sample in tqdm(data_loader):
        if use_keyword:
            x, type_x, pos_x, lm_x, x_len, _, keyword_x = sample
        else:
            x, type_x, pos_x, lm_x, x_len, _ = sample
            keyword_x = None
        # loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x, key_word=keyword_x,
        #              use_keyword=use_keyword)[0]
        loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x)[0]
        eval_loss += loss.item()
    eval_loss /= len(data_loader)
    model.train()
    return eval_loss
def parse_arguments():
    """
    Parse command line arguments using argparse.
    :return args: A parser object with hyper-parameters' names and their values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default='345M_Alex', type=str, required=False,
                        help="The directory of the model to be tuned.")
    parser.add_argument("--output_dir", default='mi_tuned', type=str, required=False,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=1)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--augment', action='store_true')
    parser.add_argument('--keyword', action='store_true')
    parser.add_argument('--cross_attention', action='store_true')
    parser.add_argument('--special_input', type=str)
    parser.add_argument('--first_K_tokens', type=int, default=1024)
    parser.add_argument('--use_disc_lr', action='store_true')
    parser.add_argument('--use_unfreezing', action='store_true')
    parser.add_argument('--num_turns', type=int, default=5)
    args = parser.parse_args()
    print(args)
    return args
def load_model(args):
    """
    Load the model and the corresponding tokenizer from pre-trained weights.
    :param args: The command line arguments.
    :return model: The main model.
    :return tokenizer: The tokenizer that comes with the main model.
    """
    USE_CUDA = torch.cuda.is_available()
    # ====== Load GPT2 model ========
    model_dir = '../models/' + args.model_dir
    # model = GPT2LMHeadModel.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    if USE_CUDA:
        model.cuda()
    # tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    print('Model loaded.')
    return model, tokenizer
def main():
    args = parse_arguments()
    # ====== Set random seed =========
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    # ======= Prepare ==========
    logging.basicConfig(level=logging.INFO)
    USE_CUDA = torch.cuda.is_available()
    FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor
    model, tokenizer = load_model(args)
    # =============== Load & process data ==============
    split_size = {'train': 0.85, 'test': 0.1, 'val': 0.05}
    data_loader, test_loader, val_loader = get_data(args, split_size=split_size, tokenizer=tokenizer)
    # gpt_alex = prepare_mix_review(args, tokenizer)
    # data_loader, val_loader = get_data(args, split_size=split_size, tokenizer=tokenizer)  # TODO: this is for the old get_data
    # ========== Prepare optimizer =============
    # The gpt2 model from the library has an unnamed LM head whose weights are tied to the input embedding.
    num_train_optimization_steps = len(data_loader) * args.num_train_epochs // args.train_batch_size
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = construct_grouped_parameters(param_optimizer, args.learning_rate,
                                                                use_discr=args.use_disc_lr)
    lm_funcs = get_unfreezing_funcs(optimizer_grouped_parameters, warmup_portion=args.warmup_proportion,
                                    total_steps=num_train_optimization_steps, use_unfreezing=args.use_unfreezing)
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lm_funcs)
    # Training
    print("Start training.")
    model.train()
    exp_average_loss = None
    progress_bar = trange(int(args.num_train_epochs), desc="Epoch", leave=True)
    min_eval_loss = 100  # large enough number
    early_terminate_counter = 0
    for epo in progress_bar:
        # data_loader = update_mix_review(gpt_train, gpt_alex, epo, mix_ratio=4, mix_decay=0.7)
        for sample in tqdm(data_loader):
            if args.cross_attention:
                x, type_x, pos_x, lm_x, x_len, _, keyword_x = sample
            else:
                x, type_x, pos_x, lm_x, x_len, _ = sample
                keyword_x = None
            input_len = x_len[0]
            lm_x[:, x_len[0] + 1 + args.first_K_tokens:-1] = -1  # mask LM labels beyond the first K response tokens
            # loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x, key_word=keyword_x,
            #              use_keyword=args.cross_attention)[0]
            loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x)[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
            progress_bar.set_description("Training loss: {}".format(exp_average_loss))
        eval_loss = evaluate(model, val_loader, use_keyword=args.cross_attention)
        print("Eval loss: {}".format(eval_loss))
        # if eval_loss < min_eval_loss:  # save the model only when the loss is the smallest
        if True:
            early_terminate_counter = 0
            min_eval_loss = eval_loss
            # ==== Save the model ====
            # Save the trained model, configuration, and tokenizer.
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
            # If we save using the predefined names, we can load using `from_pretrained`.
            output_dir = '../models/'
            output_model_file = os.path.join(output_dir + args.output_dir, WEIGHTS_NAME)
            output_config_file = os.path.join(output_dir + args.output_dir, CONFIG_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_dir + args.output_dir)
        else:
            print("eval loss increasing!")
            early_terminate_counter += 1
            if early_terminate_counter > 5:  # if the eval loss does not decrease for 5 epochs, terminate early
                return


if __name__ == '__main__':
    main()
from .my_model import GPT2ClassHeadsModel
import pytorch_transformers
from pytorch_transformers import GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, AdamW, WEIGHTS_NAME, CONFIG_NAME
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss


class GPT2ClassHeadsModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super(GPT2ClassHeadsModel, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.classifier = nn.Linear(config.n_embd, 2)
        # self.classifier = nn.Sequential(nn.Linear(config.n_embd, 768), nn.ReLU(), nn.Dropout(p=0.2),
        #                                 nn.Linear(768, 2))
        # self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.init_weights()

    def forward(self, input_ids, labels=None, token_type_ids=None,
                position_ids=None, past=None, head_mask=None):
        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                                               past=past, head_mask=head_mask)
        hidden_states = transformer_outputs[0]  # torch.Size([1, 124, 1024])
        logits = self.classifier(hidden_states[:, -1, :])  # classify from the last token's hidden state; torch.Size([1, 2])
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits, labels)
        return loss, logits
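# Usage sketch (illustrative, with hypothetical tensors): the head scores a
# premise/hypothesis pair encoded as one token sequence, using the last token's
# hidden state. Note that forward computes the loss unconditionally, so labels
# must be provided.
# model = GPT2ClassHeadsModel.from_pretrained('gpt2')
# input_ids = torch.tensor([[50256] * 8])  # placeholder batch of token ids
# labels = torch.tensor([1])
# loss, logits = model(input_ids, labels=labels)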
# Pipeline:
# 1. Get tf-idf info for each document.
# 2. Get relevant sentences to be evaluated.
# 3. Load the model.
# 4. Calculate a score for every candidate.
# 5. Pick the one with the highest score.
# 6. Save the new input.
import sys
sys.path.insert(0, '/home/shensq/LIT/pip_package')
import argparse
import glob
import re
import numpy as np
from tqdm import tqdm
import torch
from utils import text_standardize
from pytorch_transformers import GPT2Tokenizer
from gpt_loader import GptDataset, collate_fn, collate_fn_nli, GptDataset_nli, SnliDataset
from torch.utils.data import Dataset, DataLoader
from model import GPT2ClassHeadsModel
import pickle
import logging
def get_doc_utterance(files):
    num_turns = 5
    doc_utterances = []  # [doc][sen]
    doc_responses = []  # [doc][sen]
    code_set = set(['CR', 'SR', 'GIV', 'QUEST', 'SEEK', 'AF', 'EMPH', 'PWOP', 'PWP', 'CON'])
    # ===== get utterance & response lists (untokenized) ========
    for file in files:
        f = open(file)
        data = []
        for line in f:
            line = line.split()  # split on whitespace only
            data.append(line)  # data[i] is a list of tokens in a sentence
        data_utterance = []
        data_response = []
        for i, sen in enumerate(data):
            if sen[0] == 'SR' or sen[0] == 'CR':
                data_response.append(' '.join(sen[1:]))  # untokenized response
            if sen[0] not in code_set:
                data_utterance.append(' '.join(sen[1:]))  # skip lines with a code to avoid duplicates
        doc_responses.append(data_response)
        doc_utterances.append(data_utterance)
    return doc_responses, doc_utterances
def clean_text(text):
    text = text.lower()
    text = re.sub("it's", "it is", text)
    text = re.sub("i'm", "i am", text)
    text = re.sub("he's", "he is", text)
    text = re.sub("she's", "she is", text)
    text = re.sub("that's", "that is", text)
    text = re.sub("what's", "what is", text)
    text = re.sub("where's", "where is", text)
    text = re.sub("\'s", " \'s", text)
    text = re.sub("\'ll", " will", text)
    text = re.sub("\'ve", " have", text)
    text = re.sub("\'d", " would", text)
    text = re.sub("\'re", " are", text)
    text = re.sub("don't", "do not", text)
    text = re.sub("won't", "will not", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("[-()\"#/@;:<>{}+=~.…,|!?]", "", text)
    return text
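# Example (illustrative): contractions are expanded and punctuation is stripped.
# >>> clean_text("I'm sure it's fine!")
# 'i am sure it is fine'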
def get_tfidf(files, doc_utterances):
    # ================ Get word2index & tokenized utterance list =============
    word2index = {}
    index2word = []
    doc_utterances_tokenized = []  # [doc][sen] -> list of tokens
    for doc_id, doc in enumerate(doc_utterances):
        doc_tokenized = []
        for sen in doc:
            sen = clean_text(sen)
            sen = sen.split()
            doc_tokenized.append(sen)
            for word in sen:
                if word not in word2index:
                    word2index[word] = len(word2index)
                    index2word.append(word)
        doc_utterances_tokenized.append(doc_tokenized)
    doc_utterances_tokenized_flat = []
    for doc in doc_utterances_tokenized:
        doc_utterances_tokenized_flat.append([w for sen in doc for w in sen])
    # ========= Get TF-IDF ============
    tf = np.zeros([len(word2index), len(files)])  # [word][doc] -> term frequency
    for i, doc in enumerate(doc_utterances_tokenized):
        for sen in doc:
            for word in sen:
                tf[word2index[word], i] += 1
    # Inverse document frequency: count how many documents each word appears in.
    idf = np.zeros(len(word2index))  # [word] -> inverse document frequency
    df = np.sum(np.where(tf != 0, np.ones(tf.shape), np.zeros(tf.shape)), axis=1)
    idf = np.log10(len(word2index) / df).reshape(len(word2index), 1)
    tf_idf = tf * idf
    # Normalize the vector for each document.
    for i in range(tf_idf.shape[1]):
        tf_idf[:, i] = tf_idf[:, i] / np.linalg.norm(tf_idf[:, i])
    return tf_idf, tf, idf, word2index, index2word
def get_sentence_tfidf(x, word2index, idf):
    x_concat = []  # tokens of the k utterances
    for sen in x:
        sen = sen.lower()
        sen = clean_text(sen)
        sen = sen.split()
        for w in sen:
            x_concat.append(w)
    query_tfidf = np.zeros(len(word2index))
    for w in x_concat:
        query_tfidf[word2index[w]] += 1
    query_tfidf = query_tfidf.reshape(len(word2index), 1) * idf  # (vocab, 1)
    return query_tfidf
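# Retrieval sketch (mirrors the scoring in main below): document relevance is the
# dot product between the document tf-idf matrix and the query vector; since each
# document column is L2-normalized, the ranking behaves like cosine similarity.
# query = get_sentence_tfidf(x, word2index, idf)       # (vocab, 1)
# doc_score = tf_idf.T.dot(query).reshape(len(files))  # one score per document
# best_doc = np.argmax(doc_score)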
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default='345M_Alex', type=str, required=False,
                        help="The directory of the model to be tuned.")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--augment', action='store_true')
    parser.add_argument('--keyword', action='store_true')
    parser.add_argument('--special_input', type=str, default='x_y_meta')
    parser.add_argument('--first_K_tokens', type=int, default=1024)
    parser.add_argument('--num_turns', type=int, default=5)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    filepath = '../data/datasetMI_real_standardized/annotations/'
    files = glob.glob(filepath + '[0-9m]*.txt')
    file_index = dict([(f[48:], i) for i, f in enumerate(files)])  # f[48:] strips the directory prefix, keeping the file name as key
    model_dir = '../models/' + args.model_dir
    model = GPT2ClassHeadsModel.from_pretrained(model_dir)
    # model = GPT2ClassHeadsModel.from_pretrained('gpt2')
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    print('Model loaded.')
    pickle_handler = open('../data_processed/x_y_meta_all_new', 'rb')
    x_y_meta = pickle.load(pickle_handler)
    # gpt_data = GptDataset_nli(x_y_meta, tokenizer, augment=False, num_turns=10)
    gpt_data = GptDataset_nli(x_y_meta, tokenizer, args, infer=True)
    doc_responses, doc_utterances = get_doc_utterance(files)
    tf_idf, tf, idf, word2index, index2word = get_tfidf(files, doc_utterances)
    x_y_meta_aug = []
    for x, y, meta, _, _ in tqdm(x_y_meta):
        query_tfidf = get_sentence_tfidf(x, word2index, idf)
        doc_score = tf_idf.T.dot(query_tfidf).reshape(len(files))
        doc_score[file_index[meta[0]]] = 0  # exclude the document the query itself comes from
        top_k_idx = np.argsort(-doc_score)[0]  # pick only one doc
        response_candidates = doc_responses[top_k_idx]
        candidate_score = []
        candidates = list(zip([x] * len(response_candidates), response_candidates, [0] * len(response_candidates),
                              [""] * len(response_candidates), [""] * len(response_candidates)))
        gpt_data.x_encoded, gpt_data.y_encoded, gpt_data.label, _, _ = gpt_data._split(candidates)
        data_loader = DataLoader(dataset=gpt_data, batch_size=1, shuffle=False, drop_last=False,
                                 collate_fn=collate_fn_nli)
        for token_x, type_x, pos_x, lm_x, label in data_loader:
            if token_x.shape[1] >= 512:
                candidate_score.append(float('-inf'))
                continue
            loss, logits = model(token_x, position_ids=pos_x, token_type_ids=type_x, labels=label)  # [batch, class]
            # candidate_score.append(logits[:, 1].item())  # does not support batch
            candidate_score.append(torch.softmax(logits, 1)[:, 1].item())
        if len(candidate_score) > 0:
            y_aug = response_candidates[np.argmax(candidate_score)]
            x_y_meta_aug.append([x, y, meta, y_aug])
    with open('../data_processed/x_y_meta_aug_new', 'wb') as f:
        pickle.dump(x_y_meta_aug, f)


if __name__ == "__main__":
    main()
#!/bin/bash
pwd
# python retrieve_candidate.py --model_dir mi_nli
mkdir -p ../models/mi_tuned_5turn
python gpt_tuning.py --output_dir mi_tuned_5turn --num_train_epochs 10 --num_turns 5
python gpt_sample.py --model_dir mi_tuned_5turn --output_dir mi_tuned_5turn --num_turns 5 --top_p 0.95
mkdir -p ../models/mi_tuned_aug
python gpt_tuning.py --output_dir mi_tuned_aug --num_train_epochs 10 --num_turns 5 --augment
python gpt_sample.py --model_dir mi_tuned_aug --output_dir mi_tuned_aug --num_turns 5 --augment --top_p 0.95
# mkdir -p ../models/mi_tuned_keyword
#python gpt_tuning.py --output_dir mi_tuned_keyword --num_train_epochs 10 --num_turns 5 --keyword
# python gpt_sample.py --model_dir mi_tuned_keyword --output_dir mi_tuned_keyword --num_turns 5 --keyword --top_p 0.95
# mkdir -p ../models/mi_tuned_both
# python gpt_tuning.py --output_dir mi_tuned_both --num_train_epochs 10 --num_turns 10 --keyword --augment
# python gpt_sample.py --model_dir mi_tuned_both --output_dir mi_tuned_both --num_turns 10 --keyword --augment --top_p 0.95
echo "Finished."
from .preprocessing import annotate_topic, parse_text, text_standardize, clean_text
from .lexicon import values_lexicon_encode
from .my_optim import construct_grouped_parameters, get_unfreezing_funcs