Commit 3af25822 authored by Danniene Wete's avatar Danniene Wete

plot accuracy over topic for lda+classification

parent 8152325e
......@@ -1347,7 +1347,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.6.4"
}
},
"nbformat": 4,
......
......@@ -473,7 +473,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.6.4"
}
},
"nbformat": 4,
......
......@@ -289,7 +289,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.6.4"
}
},
"nbformat": 4,
......
......@@ -60,10 +60,10 @@
"metadata": {},
"outputs": [],
"source": [
"with open ('x_train.txt', 'rb') as fp: # 20 topics\n",
"with open ('x_trainRFC.txt', 'rb') as fp: # 20 topics\n",
" X_train = pickle.load(fp)\n",
"\n",
"with open ('x_test.txt', 'rb') as fp:\n",
"with open ('x_testRFC.txt', 'rb') as fp:\n",
" X_test = pickle.load(fp)\n"
]
},
......@@ -181,7 +181,8 @@
" if normalize:\n",
" plt.text(j, i, \"{:0.2f}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")\n",
" else:\n",
" plt.text(j, i, \"{:}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")"
" plt.text(j, i, \"{:}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")\n",
"plt.savefig('confusion_matrixRFC.png')"
]
},
{
......@@ -208,7 +209,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.6.4"
}
},
"nbformat": 4,
......
......@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
......@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
......@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
......@@ -45,9 +45,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
]
}
],
"source": [
"#Load train vocabulary model\n",
"train_id2word = corpora.Dictionary.load('ldaModel/id2word_dictionary.gensim')"
......@@ -55,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
......@@ -64,7 +73,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
......@@ -73,12 +82,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
]
}
],
"source": [
"mallet_path = '/home/danniene/mallet/bin/mallet' # update this path\n",
"ntpc = 90 #SVM(90), RFC(30)\n",
"#mallet_path = '/home/danniene/mallet/bin/mallet' # update this path\n",
"mallet_path = 'C:\\\\mallet\\\\bin\\\\mallet'\n",
"ntpc = 30 #SVM(90), RFC(30)\n",
"alpha = 2\n",
"ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=train_corpus,\n",
" alpha=alpha, num_topics=ntpc, id2word=train_id2word, random_seed=1046)"
......@@ -86,9 +105,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
]
}
],
"source": [
"# save model on disk\n",
"ldamallet.save('ldaModel/optimal_lda_model.gensim')"
......@@ -96,14 +124,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
]
}
],
"source": [
"train_topics = ldamallet[train_corpus]\n",
"train_topicList = []\n",
"for i in range(0, len(train_topics)):\n",
" dt = np.zeros(80)\n",
" dt = np.zeros(30)\n",
" for j in train_topics[i]:\n",
" dt[j[0]] = j[1]\n",
" \n",
......@@ -113,14 +150,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
]
}
],
"source": [
"test_topics = ldamallet[test_corpus]\n",
"test_topicList = []\n",
"for i in range(0, len(test_topics)):\n",
" dt = np.zeros(80)\n",
" dt = np.zeros(30)\n",
" for j in test_topics[i]:\n",
" dt[j[0]] = j[1]\n",
" \n",
......@@ -130,15 +176,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# serialize a train_docs to disk for later use\n",
"with open('x_trainSVM.txt', 'wb') as fp:\n",
"with open('x_trainRFC.txt', 'wb') as fp:\n",
" pickle.dump(x_train, fp)\n",
"\n",
"with open('x_testSVM.txt', 'wb') as fp:\n",
"with open('x_testRFC.txt', 'wb') as fp:\n",
" pickle.dump(x_test, fp)"
]
}
......@@ -159,7 +205,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.6.4"
}
},
"nbformat": 4,
......
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn import svm\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"from sklearn.model_selection import GridSearchCV\n",
"import pickle\n",
"\n",
"plt.style.use('bmh')\n",
"%matplotlib inline\n",
"\n",
"\n",
"def select_svc_params(X_train, y_train, folds):\n",
"    \"\"\"Grid-search SVC hyperparameters (C, gamma) with cross-validation.\n",
"\n",
"    Args:\n",
"        X_train, y_train: training data and labels\n",
"        folds: number of cross-validation folds\n",
"    Returns:\n",
"        grid_search.best_params_: best 'C' and 'gamma' values for SVC\n",
"    \"\"\"\n",
"    gamma = [0.001, 0.01, 0.1, 1]\n",
"    C = [0.001, 0.01, 0.1, 1, 10]\n",
"    params = {'C': C, 'gamma': gamma}\n",
"    # fresh local name: 'svm' would shadow the imported sklearn.svm module\n",
"    estimator = SVC()\n",
"    # 'iid' was deprecated in sklearn 0.22 and removed in 0.24; dropping it\n",
"    # keeps this cell running on current sklearn releases\n",
"    grid_search = GridSearchCV(estimator, params, cv=folds)\n",
"    grid_search.fit(X_train, y_train)\n",
"    return grid_search.best_params_\n",
"\n",
"def run_svc(X_train, y_train, X_test, y_test, params=None):\n",
"    \"\"\"Train an SVC with tuned hyperparameters and score it on the test set.\n",
"\n",
"    Args:\n",
"        X_train, y_train: training data and labels\n",
"        X_test, y_test: test data and labels\n",
"        params: optional dict with keys 'C' and 'gamma'; when None, falls\n",
"            back to the notebook-level best_params from select_svc_params\n",
"            (keeps existing callers working)\n",
"    Returns:\n",
"        score, y_pred: test-set accuracy and predicted labels\n",
"    \"\"\"\n",
"    # explicit parameter removes the hidden dependency on a global,\n",
"    # matching the rfc() signature used in the RandomForest notebook\n",
"    if params is None:\n",
"        params = best_params\n",
"    svc = SVC(C=params[\"C\"], gamma=params[\"gamma\"])\n",
"    svc.fit(X_train, y_train)\n",
"    y_pred = svc.predict(X_test)\n",
"    score = svc.score(X_test, y_test)\n",
"    return score, y_pred"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"y_train = np.loadtxt(\"../../data/y_train.txt\")\n",
"y_train = np.array([int(i)-1 for i in y_train])\n",
"\n",
"y_test = np.loadtxt(\"../../data/y_test.txt\")\n",
"y_test = np.array([int(i)-1 for i in y_test])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"with open ('x_trainSVM.txt', 'rb') as fp:\n",
" X_train = pickle.load(fp)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"with open ('x_testSVM.txt', 'rb') as fp:\n",
" X_test = pickle.load(fp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2947, 90)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#hyperparameter optimization\n",
"best_params = select_svc_params(X_train, y_train, 10)\n",
"#SVM classifier\n",
"score, y_pred = run_svc(X_train, y_train, X_test, y_test)\n",
"print('%.4f' % (score))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_test[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"labels = {0:'WALKING',1:'WALKING_UPSTAIRS',2:'WALKING_DOWNSTAIRS',3:'SITTING',4:'STANDING',5:'LAYING'}\n",
"print(classification_report(y_test, y_pred, target_names=list(labels.values())))\n",
"conf_mat = confusion_matrix(y_test, y_pred)\n",
"\n",
"fig = plt.figure(figsize=(20,20))\n",
"plt.imshow(conf_mat, cmap=plt.cm.hot, interpolation='nearest')\n",
"plt.colorbar()\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('True Labels')\n",
"plt.xlabel('Predicted labels')\n",
"plt.xticks(range(len(labels.values())), [l for l in labels.values()], rotation = 90)\n",
"plt.yticks(range(len(labels.values())), [l for l in labels.values()])\n",
"\n",
"#activate normalization confusion matrix\n",
"normalize = False\n",
"\n",
"if normalize:\n",
" conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]\n",
"\n",
"for i, j in itertools.product(range(conf_mat.shape[0]), range(conf_mat.shape[1])):\n",
" if conf_mat[i, j] > 0:\n",
" if normalize:\n",
" plt.text(j, i, \"{:0.2f}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")\n",
" else:\n",
" plt.text(j, i, \"{:}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")\n",
"plt.savefig('confusion_matrixSVM.png')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import pickle\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"plt.style.use('bmh')\n",
"%matplotlib inline\n",
"\n",
"\n",
"\n",
"\n",
"def select_rfc_params(X_train, y_train, folds):\n",
"    \"\"\"Grid-search Random Forest hyperparameters with cross-validation.\n",
"\n",
"    Args:\n",
"        X_train, y_train: training data and labels\n",
"        folds: number of cross-validation folds\n",
"    Returns:\n",
"        grid_search.best_params_: best 'n_estimators' and 'max_depth'\n",
"        for RandomForestClassifier\n",
"    \"\"\"\n",
"    # materialize the ranges as lists so the param grid is a plain sequence\n",
"    # (same values; robust across sklearn versions)\n",
"    estimator_array = list(range(50, 200, 10))\n",
"    depth_array = list(range(2, 15))\n",
"    params = {'n_estimators': estimator_array, 'max_depth': depth_array}\n",
"    # Create a base model\n",
"    rf = RandomForestClassifier()\n",
"    # Instantiate the grid search model\n",
"    grid_search = GridSearchCV(estimator=rf, param_grid=params, cv=folds)\n",
"    grid_search.fit(X_train, y_train)\n",
"    return grid_search.best_params_\n",
"\n",
"#RandomForest classifier\n",
"def rfc(X_train, y_train, X_test, y_test, best_params):\n",
"    \"\"\"Fit a RandomForestClassifier with the tuned hyperparameters and\n",
"    evaluate it on the held-out test split.\n",
"\n",
"    Args:\n",
"        X_train, y_train: training data and labels\n",
"        X_test, y_test: test data and labels\n",
"        best_params: dict with keys 'n_estimators' and 'max_depth'\n",
"    Returns:\n",
"        score, y_pred: test-set accuracy and predicted labels\n",
"    \"\"\"\n",
"    # local name 'model' avoids shadowing this function's own name\n",
"    model = RandomForestClassifier(n_estimators=best_params[\"n_estimators\"],\n",
"                                   max_depth=best_params[\"max_depth\"])\n",
"    model.fit(X_train, y_train)\n",
"    y_pred = model.predict(X_test)\n",
"    score = model.score(X_test, y_test)\n",
"    return score, y_pred"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open ('x_trainRFC.txt', 'rb') as fp: # LDA topic features (RFC pipeline uses ntpc=30; old '20 topics' note was stale)\n",
"    X_train = pickle.load(fp)\n",
"\n",
"with open ('x_testRFC.txt', 'rb') as fp: # NOTE(review): pickle.load on these files assumes they were produced by the LDA notebook — do not load untrusted pickles\n",
"    X_test = pickle.load(fp)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train = np.loadtxt(\"../../data/y_train.txt\")\n",
"y_train = np.array([int(i)-1 for i in y_train])\n",
"y_test = np.loadtxt(\"../../data/y_test.txt\")\n",
"y_test = np.array([int(i)-1 for i in y_test])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#hyperparameter optimization\n",
"best_params = select_rfc_params(X_train, y_train, 10)\n",
"score, y_pred = rfc(X_train, y_train, X_test, y_test, best_params)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('%.4f' % (score))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"labels = {0:'WALKING',1:'WALKING_UPSTAIRS',2:'WALKING_DOWNSTAIRS',3:'SITTING',4:'STANDING',5:'LAYING'}\n",
"print(classification_report(y_test, y_pred, target_names=list(labels.values())))\n",
"conf_mat = confusion_matrix(y_test, y_pred)\n",
"\n",
"fig = plt.figure(figsize=(20,20))\n",
"plt.imshow(conf_mat, cmap=plt.cm.hot, interpolation='nearest')\n",
"plt.colorbar()\n",
"plt.title('Confusion Matrix')\n",
"plt.ylabel('Ground Truth Labels')\n",
"plt.xlabel('Predicted labels')\n",
"plt.xticks(range(len(labels.values())), [l for l in labels.values()], rotation = 90)\n",
"plt.yticks(range(len(labels.values())), [l for l in labels.values()])\n",
"\n",
"#activate normalization confusion matrix\n",
"normalize = False\n",
"\n",
"if normalize:\n",
" conf_mat = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]\n",
"\n",
"for i, j in itertools.product(range(conf_mat.shape[0]), range(conf_mat.shape[1])):\n",
" if conf_mat[i, j] > 0:\n",
" if normalize:\n",
" plt.text(j, i, \"{:0.2f}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")\n",
" else:\n",
" plt.text(j, i, \"{:}\".format(conf_mat[i, j]), horizontalalignment=\"center\", color=\"black\")\n",
"plt.savefig('confusion_matrixRFC.png')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
......@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
......@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
......@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
......@@ -45,9 +45,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\smart_open\\smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
" 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
]
}
],
"source": [
"#Load train vocabulary model\n",
"train_id2word = corpora.Dictionary.load('ldaModel/id2word_dictionary.gensim')"
......@@ -55,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
......@@ -64,7 +73,7 @@
},
{
"cell_type":