Commit 280fbb04 authored by DeepLearning VM

server running code after kbert full implementation

parent c5662789
@@ -543,14 +543,19 @@ class GptDataset_KBERT(Dataset):
         pickle_handler = open("../data_processed/data_comet_dict", 'rb')
         self.data = pickle.load(pickle_handler)
+        self.max_length = 510
         self.tokenizer = tokenizer
         self.args = args
         self.num_turns = args.num_turns
         self.ref, self.speaker1, self.speaker2 = tokenizer.ref, tokenizer.speaker1, tokenizer.speaker2
         self.eos = tokenizer.eos
         self.augment = tokenizer.augment
-        # self.args.kbert_mask = True
-        # self.args.kbert_position = True
+        if not self.args.kbert:
+            self.args.kbert_mask = False
+            self.args.kbert_position = False
+            print("Not using kbert scheme.")
         if self.args.kbert_mask:
             print("using kbert-style attention mask")
         if self.args.kbert_position:
@@ -613,13 +618,14 @@ class GptDataset_KBERT(Dataset):
             last_related_token_index = len(srl_mask[i]) - 1 - srl_mask[i][::-1].index(1)
             # add comet output
-            if comet_encoded[i] is not None:
-                x += [self.augment] + comet_encoded[i]
-                type_x += [self.augment] * (len(comet_encoded[i]) + 1)
+            if self.args.kbert:
+                if comet_encoded[i] is not None:
+                    x += [self.augment] + comet_encoded[i]
+                    type_x += [self.augment] * (len(comet_encoded[i]) + 1)
                     # +2 for the special token and the requirement of one-number larger than the utterance
                     soft_position_x += list(range(soft_loc + 2 + last_related_token_index,
                                                   soft_loc + 2 + last_related_token_index + (len(comet_encoded[i]) + 1)))
             soft_loc += (len(context_encoded[i]) + 1)
             is_speaker1 = not is_speaker1
@@ -634,7 +640,13 @@ class GptDataset_KBERT(Dataset):
         lm_x += [-100] + response_encoded + [self.eos]
         soft_position_x += list(range(soft_loc, soft_loc + len(response_encoded) + 2))

+        x = x[:self.max_length]
+        type_x = type_x[:self.max_length]
+        lm_x = lm_x[:self.max_length]
+        soft_position_x = soft_position_x[:self.max_length]

         # build attention mask
         attention_mask = torch.tril(torch.ones(len(x), len(x)))
         if self.args.kbert_mask:
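For context on what `--kbert_mask` is meant to produce: the exact mask construction is collapsed out of this hunk, but the general K-BERT idea is to start from the ordinary causal mask built above and then restrict the appended COMET tokens so they only attend to the utterance span they augment (and, causally, to themselves). The sketch below illustrates that scheme under assumed inputs; it is not this repository's implementation, and `build_visible_matrix`, `augment_spans`, and their index convention are hypothetical.

    import torch

    def build_visible_matrix(seq_len, augment_spans):
        """Illustrative K-BERT-style visible matrix for a causal LM (a sketch, not the repo's code).

        augment_spans: list of (aug_start, aug_end, anchor_start, anchor_end) half-open
        index ranges. Each augmented (COMET) span may attend only to its anchor
        utterance and to itself.
        """
        mask = torch.tril(torch.ones(seq_len, seq_len))    # ordinary causal mask
        for aug_start, aug_end, anc_start, anc_end in augment_spans:
            aug = slice(aug_start, aug_end)
            mask[aug, :] = 0.0                             # wipe the augmented rows ...
            mask[aug, anc_start:anc_end] = 1.0             # ... re-allow the anchor utterance
            n = aug_end - aug_start
            mask[aug, aug] = torch.tril(torch.ones(n, n))  # ... and causal self-attention within the span
            # Whether later tokens may look back at the augmented span is a separate design
            # choice (the original K-BERT hides knowledge from unrelated tokens); for that
            # stricter variant, uncomment:
            # mask[aug_end:, aug] = 0.0
        return mask

    # e.g. a 12-token sequence whose tokens 8..10 augment the utterance at positions 2..6 (hypothetical indices)
    # attention_mask = build_visible_matrix(12, [(8, 10, 2, 6)])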
@@ -691,11 +703,12 @@ def get_data(args, tokenizer, split_size):
         pickle_handler = open('../data_processed/' + args.special_input, 'rb')
         x_y_meta = pickle.load(pickle_handler)
         gpt_data = GptDataset(x_y_meta, tokenizer, args.output_dir, num_turns=args.num_turns)
-    elif not args.kbert:
-        print("Using full data.")
-        pickle_handler = open('../data_processed/x_y_with_comet', 'rb')  # TODO: change back to the old data.
-        x_y_meta = pickle.load(pickle_handler)
-        gpt_data = GptDataset_full(x_y_meta, tokenizer, args=args)
+    # #======================origin without kbert======
+    # elif not args.kbert:
+    #     print("Using full data.")
+    #     pickle_handler = open('../data_processed/x_y_with_comet', 'rb')  # TODO: change back to the old data.
+    #     x_y_meta = pickle.load(pickle_handler)
+    #     gpt_data = GptDataset_full(x_y_meta, tokenizer, args=args)
     else:
         print("Using KBERT data")
         gpt_data = GptDataset_KBERT(tokenizer, args=args)
...
@@ -92,7 +92,7 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1,
     if torch.cuda.is_available():
         output_attention_mask = output_attention_mask.cuda()
     with torch.no_grad():
-        for i in trange(length):
+        for i in range(length):
            # inputs = {'input_ids': generated, 'past': None, 'key_word': key_word, 'use_keyword':use_keyword}
             current_length = generated.shape[-1]
             if args.kbert:
@@ -144,22 +144,17 @@ def run_model(args, model, tokenizer, test_loader):
     hyp = []
     ref = []
     context = []
-    f = open('../result/'+args.output_dir+'.txt','w')
-    f_ref = open('../result/reference_'+args.output_dir+'.txt','w')
-    for i,sample in enumerate(test_loader):
-        # if args.cross_attention:
-        #     x, type_x, pos_x, lm_x, x_len, meta, keyword_x = sample
-        # else:
-        #     x, type_x, pos_x, lm_x, x_len, meta = sample
-        #     keyword_x = None
+    # f = open('../result/'+args.output_dir+'.txt','w')
+    # f_ref = open('../result/reference_'+args.output_dir+'.txt','w')
+    for sample in tqdm(test_loader):
         x, type_x, pos_x, lm_x, x_len, attention_mask = sample
         input_len = x_len[0]  # The number of tokens of the context utterances
         context_tokens = x[0][:input_len+1]  # at evaluation stage, the input is without the ground truth
         generated = 0
         for i in range(args.nsamples // args.batch_size):
-            decode_length = int(len(context_tokens))
+            decode_length = min(int(0.5 * len(context_tokens)), 192)
             # if args.augment:
             #     decode_length = int(0.5 * (5/6) * len(context_tokens))
             out = sample_sequence(
@@ -171,18 +166,31 @@ def run_model(args, model, tokenizer, test_loader):
             out = out[:, len(context_tokens):-1].tolist()  # the generated result, get rid of eos
             ref.append(tokenizer.decode(x[0].tolist()[len(context_tokens):-1]))
-            f_ref.write(tokenizer.decode(x[0].tolist()[len(context_tokens):-1]))
-            f_ref.write('\n')
+            # f_ref.write(tokenizer.decode(x[0].tolist()[len(context_tokens):-1]))
+            # f_ref.write('\n')
             hyp.append(tokenizer.decode(out[0]))
-            f.write(tokenizer.decode(out[0]))
-            f.write('\n')
+            # f.write(tokenizer.decode(out[0]))
+            # f.write('\n')
             context.append(tokenizer.decode(x[0].tolist()[:len(context_tokens)]))
-    f.close()
-    f_ref.close()
+    # f.close()
+    # f_ref.close()
     return hyp, ref, context

+def print_metric(hyp, ref, context, effective_length=1024):
+    # ===== Calculate rouge ========
+    rouge = Rouge()
+    print(len(hyp))
+    print(len(ref))
+    hyp, ref = zip(*[(x, y) for x, y in zip(hyp, ref) if len(x) > 3 and len(y) > 3])
+    print(len(hyp))
+    hyp = [x[:effective_length] for x in hyp]
+    ref = [x[:effective_length] for x in ref]
+    scores = rouge.get_scores(hyp, ref, avg=True)
+    print("ROUGE", scores)

 def calculate_metric(hyp, ref, context, effective_length=1024):
     # ===== Calculate rouge ========
     with open('../result/rouge.txt','a') as f_result:
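A quick note on the new print_metric helper above: it assumes the `rouge` package's `Rouge().get_scores(hyp, ref, avg=True)` interface and drops hypothesis/reference pairs shorter than four characters before scoring, since empty or near-empty strings tend to break or skew the scorer. A minimal standalone usage sketch (the example strings are made up):

    from rouge import Rouge  # pip install rouge

    rouge = Rouge()
    hyp = ["that sounds like it has been really hard for you"]
    ref = ["it sounds like this has been a really hard week for you"]

    # avg=True returns one dict with averaged rouge-1 / rouge-2 / rouge-l f, p, r values
    scores = rouge.get_scores(hyp, ref, avg=True)
    print(scores["rouge-1"]["f"], scores["rouge-l"]["f"])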
@@ -222,6 +230,12 @@ def rouge_rank(hyp, ref, context):
     scores_content = sorted(scores_content, key=lambda x: x[0]['rouge-1']['f'], reverse=True)
     return scores_content

+def set_seed(seed):
+    np.random.seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.manual_seed(seed)

 if __name__ == '__main__':
     USE_CUDA = torch.cuda.is_available()
     logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
@@ -256,17 +270,27 @@ if __name__ == '__main__':
     print(args)
     # Setup the random seeds.
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    torch.cuda.manual_seed(args.seed)
-    torch.manual_seed(args.seed)
+    set_seed(args.seed)

     model, tokenizer = load_model_data(args)
     split_size = {'train': 0.90, 'test': 0.05, 'val': 0.05}
     data_loader, test_loader, val_loader = get_data(args, split_size=split_size, tokenizer=tokenizer)
     # model, tokenizer, test_loader = load_model_data(args)  # TODO: this is for old get_data
-    # import pdb;pdb.set_trace()
-    hyp, ref, context = run_model(args, model, tokenizer, test_loader)
+    # seed_list = [0,10,]
+    seed_list = [20,30]
+    # seed_list = [0,]
+    hyp_all = []
+    ref_all = []
+    context_all = []
+    for seed in seed_list:
+        set_seed(seed)
+        print("Using random seed {}".format(seed))
+        hyp, ref, context = run_model(args, model, tokenizer, test_loader)
+        hyp_all += hyp
+        ref_all += ref
+        context_all += context

     sample_ranked = rouge_rank(hyp, ref, context)
     with open("../data_processed/rouge_rank_" + args.model_dir, 'wb') as f:
         pickle.dump(sample_ranked, f)
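One reproducibility caveat on the new set_seed helper used in the loop above: it seeds NumPy and PyTorch (CPU and the current CUDA device) but not Python's built-in random module, which the training script imports. If fully repeatable sampling across the seed list matters, a slightly broader variant could look like the following sketch; the cuDNN flags are optional and trade speed for determinism.

    import random
    import numpy as np
    import torch

    def set_seed(seed):
        random.seed(seed)                 # Python's own RNG (random.shuffle, random.choice, ...)
        np.random.seed(seed)
        torch.manual_seed(seed)           # seeds the CPU RNG and, in recent PyTorch, all CUDA devices
        torch.cuda.manual_seed_all(seed)  # explicit for multi-GPU setups
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False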
...
@@ -17,7 +17,7 @@ from tqdm import tqdm, trange
 import random
 from utils import clean_text, text_standardize, construct_grouped_parameters, get_unfreezing_funcs
 from gpt_loader import GptDataset, collate_fn, collate_fn_keyword, prepare_mix_review, update_mix_review, get_data
+import gpt_sample

 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -60,7 +60,7 @@ def parse_arguments():
                         help="The output directory where the model predictions and checkpoints will be written.")
     parser.add_argument('--seed', type=int, default=42)
     parser.add_argument('--num_train_epochs', type=int, default=1)
-    parser.add_argument('--train_batch_size', type=int, default=2)
+    parser.add_argument('--train_batch_size', type=int, default=1)
     parser.add_argument('--max_grad_norm', type=int, default=1)
     parser.add_argument('--learning_rate', type=float, default=6.25e-5)
     parser.add_argument('--warmup_proportion', type=float, default=0.1)
@@ -79,6 +79,7 @@ def parse_arguments():
     parser.add_argument('--kbert', action='store_true')
     parser.add_argument('--kbert_mask', action='store_true')
     parser.add_argument('--kbert_position', action='store_true')
+    parser.add_argument('--eval_rouge', action='store_true')
     args = parser.parse_args()
     print(args)
     return args
@@ -94,12 +95,13 @@ def load_model(args):
     # ====== Load GPT2 model ========
     model_dir = '../models/' + args.model_dir
     # model = GPT2LMHeadModel.from_pretrained(model_dir)
-    model = GPT2LMHeadModel.from_pretrained('gpt2')
+    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
+    # model = GPT2LMHeadModel.from_pretrained('gpt2')
     if USE_CUDA:
         model.cuda()
     # tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
-    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
+    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     num_added_toks = tokenizer.add_tokens(['<speaker1>', '<speaker2>', '<augment>', '<ref>'])
     model.resize_token_embeddings(len(tokenizer))
     tokenizer.eos = 50256
@@ -151,26 +153,19 @@ def main():
     model.train()
     exp_average_loss = None
     progress_bar = trange(int(args.num_train_epochs), desc="Epoch", leave=True)
-    min_eval_loss = 100  # large enough number
+    prev_eval_loss = 100  # large enough number
     early_terminate_counter = 0
     for epo in progress_bar:
     # for _ in range(int(args.num_train_epochs)):
         # data_loader = update_mix_review(gpt_train, gpt_alex, epo, mix_ratio=4, mix_decay=0.7)
         for sample in tqdm(data_loader):
-        # for sample in data_loader:
-            # import pdb;pdb.set_trace()
-            # if args.cross_attention:
-            #     x, type_x, pos_x, lm_x, x_len, _, keyword_x = sample
-            # else:
-            #     x, type_x, pos_x, lm_x, x_len, _ = sample
-            #     keyword_x = None
             x, type_x, pos_x, lm_x, x_len, attention_mask = sample
             if not args.kbert:
                 attention_mask = None
             input_len = x_len[0]
             lm_x[:, x_len[0] + 1 + args.first_K_tokens:-1] = -1
-            # loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x, key_word=keyword_x,
-            #              use_keyword=args.cross_attention)[0]
             loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x, attention_mask=attention_mask)[0]
             loss.backward()
             optimizer.step()
@@ -181,10 +176,12 @@ def main():
         eval_loss = evaluate(model, val_loader, use_keyword=args.cross_attention)
         print("Eval loss: {}".format(eval_loss))
-        if eval_loss < min_eval_loss:  # save the model only when the loss is the smallest
+        if eval_loss < prev_eval_loss:  # save the model only when the loss is the smallest
         #if True:
             early_terminate_counter = 0
-            min_eval_loss = eval_loss
+            prev_eval_loss = eval_loss
             # # ==== Save the model ====
             # # Save a trained model, configuration and tokenizer
@@ -200,10 +197,23 @@ def main():
             model.save_pretrained(output_dir + args.output_dir)
             tokenizer.save_pretrained(output_dir + args.output_dir)
         else:
+            prev_eval_loss = eval_loss
             print("eval loss increasing!")
             early_terminate_counter += 1
-            if early_terminate_counter > 3:  # if the eval loss does not decrease for 5 epochs, terminate early.
+            if early_terminate_counter >= 2:  # if the eval loss does not decrease for 5 epochs, terminate early.
+                print('='*30+str(epo)+'='*30)
                 return

+        if args.eval_rouge:
+            args.nsamples = 1
+            args.length = -1
+            args.batch_size = 1
+            args.temperature = 1.0
+            args.top_k = 0
+            args.top_p = 0.95
+            hyp, ref, context = gpt_sample.run_model(args, model, tokenizer, val_loader)
+            gpt_sample.print_metric(hyp, ref, context)
+            model.train()

 if __name__ == '__main__':
     main()
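For reference on the sampling settings used by the new `--eval_rouge` branch above (`top_k=0`, `top_p=0.95`, `temperature=1.0`): with top-k disabled this amounts to pure nucleus (top-p) sampling. The helper below is the standard top-p filtering idea in sketch form; `sample_sequence` in gpt_sample.py presumably applies something equivalent, but its body is collapsed out of this diff, so treat the function name and its 1-D-logits convention as illustrative.

    import torch
    import torch.nn.functional as F

    def top_p_filtering(logits, top_p=0.95, filter_value=-float("inf")):
        """Nucleus (top-p) filtering for a 1-D logits tensor (illustrative sketch)."""
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Tokens past the nucleus boundary are removed; shift right so the first
        # token that crosses the threshold is always kept.
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
        sorted_indices_to_remove[0] = False
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        return logits

    # next_token = torch.multinomial(F.softmax(top_p_filtering(logits, 0.95), dim=-1), num_samples=1)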
 #!/bin/bash
 pwd
-NUM_EPOCHS=5
+NUM_EPOCHS=10
 NUM_TURNS=5
-MODEL_PATH="kbert"
+MODEL_PATH="no_kbert_"${NUM_EPOCHS}
 mkdir -p ../models/${MODEL_PATH}
-python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert
-python gpt_sample.py --model_dir ${MODEL_PATH} --output_dir ${MODEL_PATH} --num_turns ${NUM_TURNS} --top_p 0.95 --kbert
+# python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS}
+python gpt_sample.py --model_dir ${MODEL_PATH} --output_dir ${MODEL_PATH} --num_turns ${NUM_TURNS} --top_p 0.95
+MODEL_PATH="kbert_mask_position_"${NUM_EPOCHS}
+mkdir -p ../models/${MODEL_PATH}
+# python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert --kbert_position --kbert_mask --eval_rouge
+# python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert --kbert_position --kbert_mask
+python gpt_sample.py --model_dir ${MODEL_PATH} --output_dir ${MODEL_PATH} --num_turns ${NUM_TURNS} --top_p 0.95 --kbert --kbert_position --kbert_mask
-MODEL_PATH="kbert_position"
+MODEL_PATH="kbert_position_"${NUM_EPOCHS}
 mkdir -p ../models/${MODEL_PATH}
-python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert --kbert_position
+# python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert --kbert_position
 python gpt_sample.py --model_dir ${MODEL_PATH} --output_dir ${MODEL_PATH} --num_turns ${NUM_TURNS} --top_p 0.95 --kbert --kbert_position
-MODEL_PATH="kbert_mask_position"
+MODEL_PATH="kbert_"${NUM_EPOCHS}
 mkdir -p ../models/${MODEL_PATH}
-python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert --kbert_position --kbert_mask
+# python gpt_tuning.py --output_dir ${MODEL_PATH} --num_train_epochs ${NUM_EPOCHS} --num_turns ${NUM_TURNS} --kbert
-python gpt_sample.py --model_dir ${MODEL_PATH} --output_dir ${MODEL_PATH} --num_turns ${NUM_TURNS} --top_p 0.95 --kbert --kbert_position --kbert_mask
+python gpt_sample.py --model_dir ${MODEL_PATH} --output_dir ${MODEL_PATH} --num_turns ${NUM_TURNS} --top_p 0.95 --kbert
-#echo "Finished."
+echo "Finished."