Commit d31317a4 authored by katabanga

initial commit

The input folder contains the dataset.
The output_pic folder contains the output pictures.
The output_txt folder contains the output text files, which hold the accuracies of the different methods.
base_model.py preprocesses the dataset and selects the base model.
al_rs.py runs the active learning and random sampling processes and writes their accuracies to output_txt.
plot.py plots the accuracies for the different methods from output_txt.
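The scripts are meant to be run in order: base_model.py first, then al_rs.py, then plot.py.

al_rs.py: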
import pandas as pd
import warnings as wrn
import numpy as np
import random
import matplotlib.pyplot as plt
wrn.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from functools import partial
from typing import Callable, Optional, Tuple, Union
from modAL.uncertainty import classifier_margin, classifier_entropy
from modAL.models import ActiveLearner
from modAL.batch import ranked_batch, uncertainty_batch_sampling
from modAL.models.base import BaseCommittee, BaseLearner
import scipy.sparse as sp
def margin_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                          X: Union[np.ndarray, sp.csr_matrix],
                          n_instances: int = 20,
                          metric: Union[str, Callable] = 'euclidean',
                          n_jobs: Optional[int] = None,
                          **uncertainty_measure_kwargs
                          ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
    """Batch sampling ranked by classification margin (smaller margin = more uncertain)."""
    # classifier_margin returns the gap between the top two class probabilities
    # (higher = more confident), so it is inverted here to give ranked_batch an
    # uncertainty score where higher = more uncertain.
    uncertainty = 1 - classifier_margin(classifier, X, **uncertainty_measure_kwargs)
    query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                                 n_instances=n_instances, metric=metric, n_jobs=n_jobs)
    return query_indices, X[query_indices]
def entropy_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                           X: Union[np.ndarray, sp.csr_matrix],
                           n_instances: int = 20,
                           metric: Union[str, Callable] = 'euclidean',
                           n_jobs: Optional[int] = None,
                           **uncertainty_measure_kwargs
                           ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
    """Batch sampling ranked by predictive entropy (higher entropy = more uncertain)."""
    uncertainty = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)
    query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                                 n_instances=n_instances, metric=metric, n_jobs=n_jobs)
    return query_indices, X[query_indices]
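# Example usage (commented out; a minimal sketch, not executed by this script):
# either custom strategy plugs into modAL's ActiveLearner exactly like the
# partial() call further below. Names such as X_demo/y_demo are illustrative.
#
#   demo_learner = ActiveLearner(
#       estimator=MultinomialNB(),
#       X_training=X_demo, y_training=y_demo,
#       query_strategy=partial(margin_batch_sampling, n_instances=5),
#   )
#   query_idx, query_batch = demo_learner.query(X_pool_demo)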
'''Preprocess data'''
#read in dataset
dataset = pd.read_csv('./input/dataset.csv')
X = dataset["Text"]
le = LabelEncoder()
y = le.fit_transform(dataset["language"])
# character unigram features
uni_vector = CountVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(1,1), max_features=800)
bag_of_words_uni = uni_vector.fit_transform(X)
uni_feature_names = uni_vector.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
uni_train_features = pd.DataFrame(bag_of_words_uni.toarray(), columns=uni_feature_names)
# partition into train and test data
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(bag_of_words_uni.toarray(), y, test_size=0.1, random_state=42)
# initialize the model with randomly sampled data
X_raw = X_TRAIN
y_raw = Y_TRAIN
training_indices = random.sample(range(0, X_raw.shape[0]), 600)
X_train = X_raw[training_indices]
y_train = y_raw[training_indices]
# cross validation
kf = KFold()
clf = MultinomialNB()
scores = []
for train_index, test_index in kf.split(X_train):
    X_tr, X_t = X_train[train_index], X_train[test_index]
    y_tr, y_t = y_train[train_index], y_train[test_index]
    clf.fit(X_tr, y_tr)
    predict = clf.predict(X_t)
    acc = accuracy_score(y_t, predict)
    scores.append(acc)
# print("accuracy:", np.mean(scores))
test_predict = clf.predict(X_TEST)
test_accuracy = accuracy_score(Y_TEST, test_predict)
# print("test accuracy:", test_accuracy)
# note: the output filename is switched by hand to match the query strategy chosen below
with open('./output_txt/char_entropy.txt', 'w') as f:
    f.write(str(test_accuracy) + '\n')
'''Random sampling'''
# initialize data for random sampling
X_cur, y_cur = X_train, y_train
X_temp, y_temp = np.delete(X_raw, training_indices, axis=0), np.delete(y_raw, training_indices, axis=0)
# 32 epochs
for index in range(32):
    # pick the next batch of data randomly
    tr_indices = random.sample(range(0, X_temp.shape[0]), 600)
    X_tr, y_tr = X_temp[tr_indices], y_temp[tr_indices]
    X_temp, y_temp = np.delete(X_temp, tr_indices, axis=0), np.delete(y_temp, tr_indices, axis=0)
    X_cur, y_cur = np.concatenate((X_cur, X_tr)), np.concatenate((y_cur, y_tr))
    # cross validation
    kf = KFold()
    clf = MultinomialNB()
    scores = []
    for train_index, test_index in kf.split(X_cur):
        X_tr, X_t = X_cur[train_index], X_cur[test_index]
        y_tr, y_t = y_cur[train_index], y_cur[test_index]
        clf.fit(X_tr, y_tr)
        predict = clf.predict(X_t)
        acc = accuracy_score(y_t, predict)
        scores.append(acc)
    # print("accuracy:", np.mean(scores))
    test_predict = clf.predict(X_TEST)
    test_accuracy = accuracy_score(Y_TEST, test_predict)
    with open('./output_txt/char_random.txt', 'a') as f:
        f.write(str(test_accuracy) + '\n')
    # print("test accuracy:", test_accuracy)
# print("test accuracy:",test_accuracy)
'''Active Learning Process'''
# initialize active learner
BATCH_SIZE = 600
# switch uncertainty_batch_sampling, margin_batch_sampling, and entropy_batch_sampling
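# e.g. preset_batch = partial(margin_batch_sampling, n_instances=BATCH_SIZE)
#      preset_batch = partial(entropy_batch_sampling, n_instances=BATCH_SIZE)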
preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)
learner = ActiveLearner(
    estimator=MultinomialNB(),
    X_training=X_train,
    y_training=y_train,
    query_strategy=preset_batch,
)
# initialize data for active learning
X_cur, y_cur = X_train, y_train
X_pool, y_pool = np.delete(X_raw, training_indices, axis=0), np.delete(y_raw, training_indices, axis=0)
# 32 epochs
for index in range(32):
    # pick the next batch of data through the active learner
    query_index, query_instance = learner.query(X_pool)
    X_new, y_new = X_pool[query_index], y_pool[query_index]
    X_cur, y_cur = np.concatenate((X_cur, X_new)), np.concatenate((y_cur, y_new))
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)
    # cross validation
    clf = MultinomialNB()
    kf = KFold()
    scores = []
    for train_index, test_index in kf.split(X_cur):
        X_tr, X_t = X_cur[train_index], X_cur[test_index]
        y_tr, y_t = y_cur[train_index], y_cur[test_index]
        clf.fit(X_tr, y_tr)
        predict = clf.predict(X_t)
        acc = accuracy_score(y_t, predict)
        scores.append(acc)
    test_predict = clf.predict(X_TEST)
    test_accuracy = accuracy_score(Y_TEST, test_predict)
    with open('./output_txt/char_entropy.txt', 'a') as f:
        f.write(str(test_accuracy) + '\n')
    print(test_accuracy)
    # retrain the learner on the newly queried batch
    learner.teach(X=X_new, y=y_new)
    # print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=np.mean(scores)))
    # print('Std after query {n}: {acc:0.4f}'.format(n=index + 1, acc=np.std(scores)))
    # print('test acc after query {n}: {acc:0.4f}'.format(n=index + 1, acc=test_accuracy))
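base_model.py: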
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np
import seaborn as sns
import pandas as pd
import time
import warnings as wrn
wrn.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score,accuracy_score,auc,roc_curve
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# read in data
dataset=pd.read_csv("./input/dataset.csv")
X,y = dataset["Text"],dataset["language"]
# plot distribution
sns.countplot(x=y)
plt.xlabel("Languages")
plt.ylabel("Number of Samples")
plt.xticks(rotation = 60)
plt.title('Distribution of Language Samples')
plt.savefig("./output_pic/languagedistribution.png", bbox_inches = "tight")
le = LabelEncoder()
y = le.fit_transform(y)
# print('Total class count:', np.unique(y).shape[0])  # 22 unique classes
# le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
# print('Label mapping:',le_name_mapping)
# print('Frequency for each class:')
# unique, counts = np.unique(y, return_counts=True)
# print(np.asarray((unique, counts)).T) # each class has 1000 examples
# n-gram selection: build char and word n-gram features for n = 1..5
char_list = []
word_list = []
for n in range(1, 6):
    char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n), max_features=800)
    char_counts = char_vectorizer.fit_transform(X)
    char_list.append(pd.DataFrame(char_counts.toarray(),
                                  columns=char_vectorizer.get_feature_names_out()))
    word_vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n), max_features=800)
    word_counts = word_vectorizer.fit_transform(X)
    word_list.append(pd.DataFrame(word_counts.toarray(),
                                  columns=word_vectorizer.get_feature_names_out()))
char_accuracy = []
char_f1 = []
word_accuracy = []
word_f1 = []
n_gram = [1,2,3,4,5]
for X_features in char_list:
    X_tr, X_t, y_tr, y_t = train_test_split(X_features, y, test_size=0.2)
    clf = LogisticRegression()
    clf.fit(X_tr, y_tr)
    predict = clf.predict(X_t)
    acc = accuracy_score(y_t, predict)
    char_accuracy.append(acc)
    f1 = f1_score(y_t, predict, average='weighted')
    char_f1.append(f1)
for X_features in word_list:
    X_tr, X_t, y_tr, y_t = train_test_split(X_features, y, test_size=0.2)
    clf = LogisticRegression()
    clf.fit(X_tr, y_tr)
    predict = clf.predict(X_t)
    acc = accuracy_score(y_t, predict)
    word_accuracy.append(acc)
    f1 = f1_score(y_t, predict, average='weighted')
    word_f1.append(f1)
plt.clf()
plt.plot(n_gram, char_accuracy, label = 'Accuracy for Char N-gram')
plt.plot(n_gram, char_f1, label = 'F-1 score for Char N-gram')
plt.plot(n_gram, word_accuracy, label = 'Accuracy for Word N-gram')
plt.plot(n_gram, word_f1, label = 'F-1 score for Word N-gram')
plt.xticks(n_gram,[1,2,3,4,5])
plt.xlabel('N selection for N-gram')
plt.ylabel('Accuracy/F-1 Score')
plt.legend(loc='lower left')
plt.title('Comparison of Different N-gram Approaches')
plt.savefig('./output_pic/ngram.png')
# select base model
names = ["Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest", "Logistic Regression", "AdaBoost",
"Naive Bayes", "QDA"]
classifiers = [
KNeighborsClassifier(5),
SVC(kernel="linear"),
DecisionTreeClassifier(max_depth=30),
RandomForestClassifier(max_depth=30),
LogisticRegression(),
AdaBoostClassifier(),
MultinomialNB(),
QuadraticDiscriminantAnalysis()]
kf = KFold()  # note: if rows are grouped by language, KFold(shuffle=True) would give more reliable folds
clf_acc = []
clf_time = []
for name, clf in zip(names, classifiers):
    times = []
    scores = []
    for train_index, test_index in kf.split(char_list[0]):
        X_tr, X_t = char_list[0].iloc[train_index, :], char_list[0].iloc[test_index, :]
        y_tr, y_t = y[train_index], y[test_index]
        start_time = time.time()
        clf.fit(X_tr, y_tr)
        end_time = time.time()
        process_time = round(end_time - start_time, 2)
        times.append(process_time)
        predict = clf.predict(X_t)
        acc = accuracy_score(y_t, predict)
        scores.append(acc)
    clf_acc.append(np.mean(scores))
    clf_time.append(np.mean(times))
plt.clf()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))
clfs = [0,1,2,3,4,5,6,7]
ax1.plot(clfs, clf_acc)
ax1.set_xticks(range(8))
ax1.set_xticklabels(names,rotation=30)
ax1.set_xlabel('Classifiers')
ax1.set_ylabel('Mean Accuracy for 5-fold')
ax1.set_title('Comparison of Performance of Classifiers')
ax2.bar(clfs,clf_time)
ax2.set_xticks(range(8))
ax2.set_xticklabels(names,rotation=30)
ax2.set_xlabel('Classifiers')
ax2.set_ylabel('Mean Fitting Time for 5-fold (seconds)')
ax2.set_title('Comparison of Fitting Time of Classifiers')
fig.savefig('./output_pic/basemodel.png', bbox_inches = "tight")
# plot roc curve
'''Plot per-class ROC curves for the one-vs-rest Naive Bayes classifier'''
y_bin = label_binarize(y, classes=np.arange(22))
clf_nb = OneVsRestClassifier(MultinomialNB())
nb_scores = []
kf = KFold()
for train_index, test_index in kf.split(char_list[0]):
    X_train, X_test = char_list[0].iloc[train_index, :], char_list[0].iloc[test_index, :]
    y_train, y_test = y_bin[train_index], y_bin[test_index]
    clf_nb.fit(X_train, y_train)
    nb_scores.append(clf_nb.predict_proba(X_test))
# use the class probabilities from the last fold, which line up with y_test above
# (np.mean over the folds would collapse everything into a single scalar)
y_score = nb_scores[-1]
plt.cla()
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(22):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
for i in range(22):
    plt.plot(fpr[i], tpr[i], lw=1,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=1.5)
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for Naive Bayes')
plt.legend(bbox_to_anchor=(1.0, 1.0))
plt.savefig('./output_pic/roc.png', bbox_inches = "tight")
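# Optional micro-averaged summary (a minimal sketch, assuming the y_test and
# y_score arrays from the last fold above): pooling all (sample, class)
# decisions into a single binary ROC gives one overall AUC figure.
fpr_micro, tpr_micro, _ = roc_curve(y_test.ravel(), y_score.ravel())
print('micro-averaged AUC: {0:0.4f}'.format(auc(fpr_micro, tpr_micro)))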
\ No newline at end of file
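output_txt accuracy files (one value per line; the first entry in each is the initial model's test accuracy):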
0.8813636363636363
0.8850
0.9341
0.9445
0.9459
0.9491
0.9509
0.9495
0.9495
0.9486
0.9477
0.9486
0.9482
0.9473
0.9477
0.9473
0.9482
0.9473
0.9477
0.9477
0.9477
0.9477
0.9477
0.9486
0.9486
0.9491
0.9486
0.9486
0.9486
0.9486
0.9491
0.9491
0.9491
\ No newline at end of file
0.8813636363636363
0.9268
0.9377
0.9382
0.9395
0.9405
0.9400
0.9409
0.9418
0.9409
0.9418
0.9409
0.9405
0.9432
0.9432
0.9436
0.9427
0.9427
0.9427
0.9427
0.9418
0.9418
0.9418
0.9418
0.9423
0.9432
0.9418
0.9427
0.9427
0.9427
0.9423
0.9427
0.9423
\ No newline at end of file
0.8813636363636363
0.92
0.9313636363636364
0.9395454545454546
0.9409090909090909
0.9418181818181818
0.9431818181818182
0.9413636363636364
0.9427272727272727
0.9431818181818182
0.9436363636363636
0.945
0.945
0.9445454545454546
0.945
0.945
0.9445454545454546
0.9436363636363636
0.9440909090909091
0.9454545454545454
0.9454545454545454
0.9459090909090909
0.9454545454545454
0.9445454545454546
0.9436363636363636
0.9440909090909091
0.9440909090909091
0.9436363636363636
0.9440909090909091
0.9436363636363636
0.9436363636363636
0.9445454545454546
0.9445454545454546
0.8813636363636363
0.9300
0.9418
0.9432
0.9464
0.9468
0.9468
0.9464
0.9477
0.9459
0.9468
0.9473
0.9459
0.9477
0.9482
0.9482
0.9473
0.9468
0.9482
0.9477
0.9482
0.9468
0.9468
0.9464
0.9459
0.9459
0.9459
0.9464
0.9464
0.9464
0.9459
0.9455
0.9477
\ No newline at end of file
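plot.py: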
import matplotlib.pyplot as plt

def read_accuracies(path):
    # one accuracy per line, as written by al_rs.py
    with open(path) as f:
        return [float(line.strip()) for line in f]

random = read_accuracies("./output_txt/char_random.txt")
uncertainty = read_accuracies("./output_txt/char_uncertainty.txt")
margin = read_accuracies("./output_txt/char_margin.txt")
entropy = read_accuracies("./output_txt/char_entropy.txt")

# 33 points per curve: the initial model plus 32 query epochs
epochs = list(range(1, 34))
plt.plot(epochs, random, label='Random Sampling')
plt.plot(epochs, uncertainty, label='Least Confidence')
plt.plot(epochs, margin, label='Smallest Margin')
plt.plot(epochs, entropy, label='Entropy')
plt.legend(loc='lower right')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy for Active Learning vs. Random Sampling')
plt.savefig('./output_pic/performance.png')
\ No newline at end of file