...
 
Commits (2)
......@@ -21,6 +21,7 @@
"gyro_z = np.loadtxt('../../data/body_gyro_z_train.txt')\n",
"\n",
"gyro_z.shape\n",
"\n",
"#Load test data\n",
"\n",
"acc_xtest = np.loadtxt('../../data/total_acc_x_test.txt')\n",
......@@ -37,7 +38,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Combine all 3 channels data to form one 3D matrix of data\n",
"#Combine the 3 channels of each instance to form one 3D matrix, but this must not be done.\n",
"def combineData(X, Y, Z):\n",
" combinedData = []\n",
" \n",
......@@ -104,7 +105,7 @@
" return np.array(centroids_array)\n",
"\n",
"\n",
"# Map centroids to chars\n",
"# Assign each centroids to chars in order to create a sensory alphabet\n",
"\n",
"def centroidsToChars(centroids):\n",
" alphabet = {0:'a', 1:'b', 2:'c', 3:'d', 4:'e', 5:'f', 6:'g', 7:'h', 8:'i', 9:'j', 10:'k', 11:'l',\n",
......@@ -124,6 +125,7 @@
" \n",
" return np.array(centroids_Array)\n",
"\n",
"\n",
"# Map subsequences to chars\n",
"def mapCodewordsToChars(X, centroids_array, activity_count): #X: 3D matrix with subsequences for each channel\n",
" \n",
......@@ -154,7 +156,7 @@
" \n",
" return np.array(codeword_index_array)\n",
"\n",
"# Create words\n",
"# This method create sensory words using the sensory alphabet of subsequences.\n",
"def create_words(acc, gyr):\n",
" \n",
" x_acc = acc[0]\n",
......@@ -209,7 +211,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Process training and testing data\n",
"# Process training and testing data to create the BoW of each data set.\n",
"\n",
"def codebook_approach(window_length, n_cluster):\n",
" \"\"\"Codebook Approach\n",
......@@ -218,7 +220,7 @@
" window_length\n",
" n_cluster\n",
" Returns:\n",
" X_train, X_test\n",
" trainBow, testBow\n",
" \"\"\" \n",
" \n",
" # 1. Sliding window approach\n",
......
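For readers skimming the diff: the cells changed above implement the codebook step (sliding windows over each channel, k-means codewords, one char per codeword, sensory words per instance). A minimal, self-contained sketch of that idea follows; the window step, helper names, and the single-channel simplification are illustrative assumptions, not the notebook's own code.

import string
import numpy as np
from sklearn.cluster import KMeans

def sliding_windows(signal, window_length, step):
    # Cut one 1-D signal into overlapping subsequences.
    return np.array([signal[i:i + window_length]
                     for i in range(0, len(signal) - window_length + 1, step)])

def build_alphabet(n_clusters):
    # Cluster index -> character (enough for small alphabets).
    return dict(enumerate(string.ascii_letters[:n_clusters]))

def instances_to_documents(instances, window_length, step, n_clusters, seed=0):
    # Turn each instance (1-D array) into a list of sensory "words" (chars).
    all_windows = np.vstack([sliding_windows(x, window_length, step) for x in instances])
    km = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit(all_windows)
    alphabet = build_alphabet(n_clusters)
    return [[alphabet[label] for label in km.predict(sliding_windows(x, window_length, step))]
            for x in instances]

# Toy usage: 5 random "instances" of 128 samples each.
rng = np.random.default_rng(0)
documents = instances_to_documents([rng.standard_normal(128) for _ in range(5)],
                                   window_length=20, step=10, n_clusters=6)

The notebook goes further and interleaves the chars of the accelerometer and gyroscope channels into one word per position; the sketch keeps a single channel for brevity.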
......@@ -23,7 +23,7 @@
"from sklearn.metrics import precision_recall_fscore_support, accuracy_score\n",
"from sklearn.metrics.cluster import contingency_matrix\n",
"from sklearn.metrics import adjusted_rand_score\n",
"%run createSensoryWords_fs4.ipynb\n",
"%run createSensoryWords.ipynb\n",
"\n",
"\n",
"# Function for getting the topic with the maximal proportion in each document.\n",
......@@ -45,7 +45,7 @@
" return np.array(topicList)\n",
" \n",
"\n",
"# Function for mapping topic to class\n",
"# Function for mapping topics to activity classes\n",
"def topic_toClass_mapping(dataframe):\n",
" \n",
" mapping = {} # dictionnary with topic ID as key and the class ID as value.\n",
......@@ -145,7 +145,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Find the window length and n_clusters combination with highest accuracy\n",
"# Find the window length and n_clusters combination that generates the highest accuracy value\n",
"\n",
"for window_length in range(10, 40, 5):\n",
" for n_clusters in range(14, 70, 3): \n",
......
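The loop above scores each window_length/n_clusters combination by mapping LDA topics to activity classes through the contingency matrix. A hedged sketch of that mapping and scoring, with illustrative names (the notebook's own helpers work on dataframes and differ in detail):

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import contingency_matrix

def topics_to_classes(y_true, pred_topics):
    # Map each topic ID to the majority class among its documents.
    classes = np.unique(y_true)              # rows of the contingency matrix
    topics = np.unique(pred_topics)          # columns of the contingency matrix
    cm = contingency_matrix(y_true, pred_topics)
    return {topic: classes[cm[:, j].argmax()] for j, topic in enumerate(topics)}

def mapped_accuracy(y_true, pred_topics):
    # Score the topic assignment after translating topics into class labels.
    mapping = topics_to_classes(y_true, pred_topics)
    return accuracy_score(y_true, [mapping[t] for t in pred_topics])

# Toy usage: topic 5 -> class 0, topic 3 -> class 1, topic 7 -> class 2.
print(mapped_accuracy(np.array([0, 0, 1, 1, 2, 2]), np.array([5, 5, 3, 3, 7, 3])))  # 5/6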
......@@ -6,7 +6,7 @@
"metadata": {},
"outputs": [],
"source": [
"%run createSensoryWords_fs4.ipynb\n",
"%run createSensoryWords.ipynb\n",
"\n",
"import numpy as np\n",
"import pickle\n",
......@@ -88,9 +88,9 @@
"metadata": {},
"outputs": [],
"source": [
"# Create several lda models and apply them on train and test data\n",
"# Creates lda \n",
"\n",
"def SearchBestModel(corpus_train, train_vocab, rnd_numb):\n",
"def createLdaModel(corpus_train, train_vocab, rnd_numb):\n",
" mallet_path = '/home/danniene/mallet/bin/mallet'\n",
" ntpc = 6\n",
" alpha = 0.01\n",
......@@ -113,7 +113,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Function for mapping topic to class\n",
"# Function for mapping topics to activity classes\n",
"\n",
"def topic_toClass_mapping(dataframe):\n",
" \n",
......@@ -207,9 +207,10 @@
}
],
"source": [
"# Optimization of the random seed parameter: find the random seed number that gives the best result with lda.\n",
"for rdst in range(0, 1500):\n",
" \n",
" pred_topics = SearchBestModel(train_corpus, train_id2word, rdst)\n",
" pred_topics = createLdaModel(train_corpus, train_id2word, rdst)\n",
" CM = contingency_matrix(y_train, pred_topics)\n",
" df_document_topics = pd.DataFrame(CM, index=['Class'+str(i) for i in list(set(y_train))], \n",
" columns=['Top'+str(i) for i in list(set(pred_topics))])\n",
......
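The seed sweep above trains one Mallet LDA model per random seed and keeps the best contingency-matrix result. A rough stand-in sketch of that search is shown below; it uses sklearn's LatentDirichletAllocation instead of the gensim Mallet wrapper, the adjusted Rand index instead of the mapped accuracy, and toy data and names throughout.

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import adjusted_rand_score

def fit_lda_topics(docs, n_topics, seed):
    # docs: whitespace-joined sensory-word strings -> dominant topic per document.
    counts = CountVectorizer(token_pattern=r"\S+").fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=seed)
    return lda.fit_transform(counts).argmax(axis=1)

toy_docs = ["aab abb aab", "ccd ccd dce", "aab abb abb", "dce ccd ccd"]
toy_labels = [0, 1, 0, 1]
scores = {seed: adjusted_rand_score(toy_labels, fit_lda_topics(toy_docs, 2, seed))
          for seed in range(5)}
best_seed = max(scores, key=scores.get)
print("best seed:", best_seed, "ARI:", scores[best_seed])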
......@@ -29,7 +29,7 @@
"metadata": {},
"outputs": [],
"source": [
"# load train corpus from disk\n",
"# load the BoW of the training data from disk\n",
"with open ('trainTestCorpus/corpus.train', 'rb') as fp:\n",
" train_docs = pickle.load(fp)"
]
......@@ -136,7 +136,7 @@
}
],
"source": [
"# Plot the the distribution of SW in one sample documents.\n",
"# Plot the the distribution of SW in one activity instance.\n",
"doc_m5 = train_docs[847]\n",
"counter1 = collections.Counter(doc_m5)\n",
"counter1 = dict(counter1)\n",
......@@ -256,6 +256,7 @@
}
],
"source": [
"#plot the distribution of the 20 most prominent SW in the training data\n",
"x_pos = range(len(words[:20]))\n",
"xlabels = ['SW'+str(i+1) for i in range(20)]\n",
"plt.figure(figsize=(6,4))\n",
......@@ -531,6 +532,8 @@
"metadata": {},
"outputs": [],
"source": [
"#Incrementally remove most frequent SW and text how lda perform afterward.\n",
"\n",
"for idftreshold in np.linspace(0.0, 2.0, num=10):\n",
" discoveredStopwords = [word for word in idfDict_test.keys() if idfDict_test[word] <= float(idftreshold)]\n",
" percentageOfRemovedWords = len(discoveredStopwords)/len(idfDict_test)\n",
......
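The threshold sweep in the last hunk removes sensory words whose IDF falls at or below a cutoff (frequent words have low IDF) and re-runs LDA on the filtered documents. A minimal sketch of the IDF filtering itself, with illustrative names and toy data:

import math

def idf_table(docs):
    # docs: list of documents, each a list of sensory words.
    n_docs = len(docs)
    df = {}
    for doc in docs:
        for word in set(doc):
            df[word] = df.get(word, 0) + 1
    return {word: math.log(n_docs / count) for word, count in df.items()}

def remove_frequent_words(docs, threshold):
    # Drop every word whose IDF is at or below the threshold.
    idf = idf_table(docs)
    stopwords = {w for w, v in idf.items() if v <= threshold}
    return [[w for w in doc if w not in stopwords] for doc in docs], stopwords

# Toy usage: 'a' occurs in every document, so idf('a') = log(3/3) = 0.0 and it is removed first.
filtered, dropped = remove_frequent_words([["a", "b", "c"], ["a", "c"], ["a", "d"]], threshold=0.0)
print(dropped)  # {'a'}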