Commit e13a304c authored by Danniene Wete

rename folders and move around files

parent fa0abc6e
@@ -10,7 +10,6 @@
"import matplotlib.pyplot as plt\n",
"import math\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.cluster import MiniBatchKMeans\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from scipy import stats\n"
]
@@ -470,13 +469,6 @@
"source": [
"vocabulary[6]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"###### inertia tells how far away the points within a cluster are. Therefore, a small of inertia is aimed for. The range of inertia’s value starts from zero and goes up. Silhouette score: Silhouette score tells how far away the datapoints in one cluster are, from the datapoints in another cluster. The range of silhouette score is from -1 to 1. Score should be closer to 1 than -1"
]
}
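,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (not part of the original commit): compares inertia and\n",
"# silhouette score over a range of k, as described above. The synthetic X is\n",
"# an assumption; any feature matrix built in this notebook could replace it.\n",
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"rng = np.random.RandomState(0)\n",
"X = np.vstack([rng.normal(loc=c, scale=0.3, size=(50, 2)) for c in (0, 3, 6)])\n",
"\n",
"for k in range(2, 6):\n",
"    km = KMeans(n_clusters=k, random_state=0).fit(X)\n",
"    # inertia: within-cluster sum of squared distances -- lower is better\n",
"    # silhouette: cluster separation in [-1, 1] -- closer to 1 is better\n",
"    print(k, round(km.inertia_, 2), round(silhouette_score(X, km.labels_), 3))"
]
}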
],
"metadata": {
......
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✓ [[1]]\n",
"✓ [[0], [3, 4]]\n",
"✓ [[-3, 0], [4]]\n",
"✓ [[1, 1, 1, 1]]\n",
"✓ [[1], [2], [3]]\n",
"✓ [[1], [2, 2], [3]]\n",
"✓ [[1], [2, 2], [3, 3]]\n",
"✓ [[1], [2, 2], [3, 3]]\n",
"✓ [[1], [2, 2], [3, 3]]\n",
"✓ [[1, 2, 2], [3, 3], [5]]\n",
"✓ [[0, 1, 2], [100, 101, 103]]\n",
"✓ [[0, 1, 2], [50], [100, 101, 103]]\n",
"✓ [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]\n"
]
}
],
"source": [
"# This code was taken from https://github.com/rocketrip/ckmeans\n",
"import numpy as np\n",
"\n",
"def ssq(j, i, sum_x, sum_x_sq):\n",
" if (j > 0):\n",
" muji = (sum_x[i] - sum_x[j-1]) / (i - j + 1)\n",
" sji = sum_x_sq[i] - sum_x_sq[j-1] - (i - j + 1) * muji ** 2\n",
" else:\n",
" sji = sum_x_sq[i] - sum_x[i] ** 2 / (i+1)\n",
"\n",
" return 0 if sji < 0 else sji\n",
"\n",
"def fill_row_k(imin, imax, k, S, J, sum_x, sum_x_sq, N):\n",
" if imin > imax: return\n",
"\n",
" i = (imin+imax) // 2\n",
" S[k][i] = S[k-1][i-1]\n",
" J[k][i] = i\n",
"\n",
" jlow = k\n",
"\n",
" if imin > k:\n",
" jlow = int(max(jlow, J[k][imin-1]))\n",
" jlow = int(max(jlow, J[k-1][i]))\n",
"\n",
" jhigh = i-1\n",
" if imax < N-1:\n",
" jhigh = int(min(jhigh, J[k][imax+1]))\n",
"\n",
" for j in range(jhigh, jlow-1, -1):\n",
" sji = ssq(j, i, sum_x, sum_x_sq)\n",
"\n",
" if sji + S[k-1][jlow-1] >= S[k][i]: break\n",
"\n",
" # Examine the lower bound of the cluster border\n",
" # compute s(jlow, i)\n",
" sjlowi = ssq(jlow, i, sum_x, sum_x_sq)\n",
"\n",
" SSQ_jlow = sjlowi + S[k-1][jlow-1]\n",
"\n",
" if SSQ_jlow < S[k][i]:\n",
" S[k][i] = SSQ_jlow\n",
" J[k][i] = jlow\n",
"\n",
" jlow += 1\n",
"\n",
" SSQ_j = sji + S[k-1][j-1]\n",
" if SSQ_j < S[k][i]:\n",
" S[k][i] = SSQ_j\n",
" J[k][i] = j\n",
"\n",
" fill_row_k(imin, i-1, k, S, J, sum_x, sum_x_sq, N)\n",
" fill_row_k(i+1, imax, k, S, J, sum_x, sum_x_sq, N)\n",
"\n",
"def fill_dp_matrix(data, S, J, K, N):\n",
" sum_x = np.zeros(N, dtype=np.float_)\n",
" sum_x_sq = np.zeros(N, dtype=np.float_)\n",
"\n",
" # median. used to shift the values of x to improve numerical stability\n",
" shift = data[N//2]\n",
"\n",
" for i in range(N):\n",
" if i == 0:\n",
" sum_x[0] = data[0] - shift\n",
" sum_x_sq[0] = (data[0] - shift) ** 2\n",
" else:\n",
" sum_x[i] = sum_x[i-1] + data[i] - shift\n",
" sum_x_sq[i] = sum_x_sq[i-1] + (data[i] - shift) ** 2\n",
"\n",
" S[0][i] = ssq(0, i, sum_x, sum_x_sq)\n",
" J[0][i] = 0\n",
"\n",
" for k in range(1, K):\n",
" if (k < K-1):\n",
" imin = max(1, k)\n",
" else:\n",
" imin = N-1\n",
"\n",
" fill_row_k(imin, N-1, k, S, J, sum_x, sum_x_sq, N)\n",
"\n",
"def ckmeans(data, n_clusters):\n",
" if n_clusters <= 0:\n",
" raise ValueError(\"Cannot classify into 0 or less clusters\")\n",
" if n_clusters > len(data):\n",
" raise ValueError(\"Cannot generate more classes than there are data values\")\n",
"\n",
" # if there's only one value, return it; there's no sensible way to split\n",
" # it. This means that len(ckmeans([data], 2)) may not == 2. Is that OK?\n",
" unique = len(set(data))\n",
" if unique == 1:\n",
" return [data]\n",
"\n",
" data.sort()\n",
" n = len(data)\n",
"\n",
" S = np.zeros((n_clusters, n), dtype=np.float_)\n",
"\n",
" J = np.zeros((n_clusters, n), dtype=np.uint64)\n",
"\n",
" fill_dp_matrix(data, S, J, n_clusters, n)\n",
"\n",
" clusters = []\n",
" cluster_right = n-1\n",
"\n",
" for cluster in range(n_clusters-1, -1, -1):\n",
" cluster_left = int(J[cluster][cluster_right])\n",
" clusters.append(data[cluster_left:cluster_right+1])\n",
"\n",
" if cluster > 0:\n",
" cluster_right = cluster_left - 1\n",
"\n",
" return list(reversed(clusters))\n",
"\n",
"##\n",
"## HELPER CODE FOR TESTS\n",
"##\n",
"\n",
"# partition recipe modified from\n",
"# http://wordaligned.org/articles/partitioning-with-python\n",
"from itertools import chain, combinations\n",
"\n",
"def sliceable(xs):\n",
" '''Return a sliceable version of the iterable xs.'''\n",
" try:\n",
" xs[:0]\n",
" return xs\n",
" except TypeError:\n",
" return tuple(xs)\n",
"\n",
"def partition_n(iterable, n):\n",
" s = sliceable(iterable)\n",
" l = len(s)\n",
" b, mid, e = [0], list(range(1, l)), [l]\n",
" getslice = s.__getitem__\n",
" splits = (d for i in range(l) for d in combinations(mid, n-1))\n",
" return [[s[sl] for sl in map(slice, chain(b, d), chain(d, e))]\n",
" for d in splits]\n",
"\n",
"def squared_distance(part):\n",
" mean = sum(part)/len(part)\n",
" return sum((x-mean)**2 for x in part)\n",
"\n",
"# given a partition, return the sum of the squared distances of each part\n",
"def sum_of_squared_distances(partition):\n",
" return sum(squared_distance(part) for part in partition)\n",
"\n",
"# brute force the correct answer by testing every partition.\n",
"def min_squared_distance(data, n):\n",
" return min((sum_of_squared_distances(partition), partition)\n",
" for partition in partition_n(data, n))\n",
"\n",
"if __name__ == \"__main__\":\n",
" try:\n",
" ckmeans([], 10)\n",
" 1/0\n",
" except ValueError:\n",
" pass\n",
"\n",
" tests = [\n",
" (([1], 1), [[1]]),\n",
" (([0,3,4], 2), [[0], [3,4]]),\n",
" (([-3,0,4], 2), [[-3,0], [4]]),\n",
" (([1,1,1,1], 1), [[1,1,1,1]]),\n",
" (([1,2,3], 3), [[1], [2], [3]]),\n",
" (([1,2,2,3], 3), [[1], [2,2], [3]]),\n",
" (([1,2,2,3,3], 3), [[1], [2,2], [3,3]]),\n",
" (([1,2,3,2,3], 3), [[1], [2,2], [3,3]]),\n",
" (([3,2,3,2,1], 3), [[1], [2,2], [3,3]]),\n",
" (([3,2,3,5,2,1], 3), [[1,2,2], [3,3], [5]]),\n",
" (([0,1,2,100,101,103], 2), [[0,1,2], [100,101,103]]),\n",
" (([0,1,2,50,100,101,103], 3), [[0,1,2], [50], [100,101,103]]),\n",
" (([-1,2,-1,2,4,5,6,-1,2,-1], 3),\n",
" [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]),\n",
" ]\n",
"\n",
" for test in tests:\n",
" args, expected = test\n",
" try:\n",
" result = ckmeans(*args)\n",
" except:\n",
" print(\"✗ {}, {}\".format(args[0], args[1], result))\n",
" raise\n",
" errormsg = \"✗ ckmeans({}) = {} != {}\\n{} > {}\".format(\n",
" args, result, expected,\n",
" sum_of_squared_distances(result),\n",
" sum_of_squared_distances(expected))\n",
" assert np.array_equal(result, expected), errormsg\n",
" print(\"✓ {}\".format(result))"
]
},
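{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative usage sketch (not in the original commit): unlike heuristic\n",
"# k-means, ckmeans returns the provably optimal 1-D partition via dynamic\n",
"# programming. The input values below are made up for demonstration.\n",
"print(ckmeans([1, 2, 4, 5, 12, 43, 52], 3))  # [[1, 2, 4, 5], [12], [43, 52]]"
]
},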
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -4,13 +4,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Features extracted: mean, slope, and standard deviation of each subsequence.\n",
"### Feature vectors of subsequences in the same time window along the three channels of each sensor are concatenated. Feature vectors are normalized to get all features on the same scale."
"Features extracted: mean, slope, standard deviation, interquartile range, max, min from subsequences.\n",
"Feature vectors in the same time window are concatenated in channel combinations.\n",
"Feature vectors are normalized."
]
},
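{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch (not part of the commit): the per-subsequence features\n",
"# listed above, computed for one made-up window x. The commit derives the\n",
"# slope via get_Orientation; a least-squares fit is used here as a stand-in.\n",
"import numpy as np\n",
"from scipy import stats\n",
"\n",
"x = np.array([0.1, 0.3, 0.2, 0.6, 0.5])         # one subsequence (example data)\n",
"slope = np.polyfit(np.arange(len(x)), x, 1)[0]  # slope of the fitted line\n",
"features = [slope, np.mean(x), np.std(x), np.min(x), np.max(x), stats.iqr(x)]\n",
"print(features)  # [slope, mean, std, min, max, IQR] for this window"
]
},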
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -18,14 +19,13 @@
"import matplotlib.pyplot as plt\n",
"import math\n",
"#from sklearn.cluster import KMeans\n",
"from sklearn.cluster import MiniBatchKMeans\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from scipy import stats\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -109,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -125,10 +125,10 @@
" std_x = np.std(x)\n",
" return std_x\n",
" \n",
"def mad(x):\n",
" \"\"\"Calculates median absolute deviation\"\"\"\n",
" mad_x = np.median(np.abs(x - np.median(x)))\n",
" return mad_x\n",
"#def mad(x):\n",
" # \"\"\"Calculates median absolute deviation\"\"\"\n",
" # mad_x = np.median(np.abs(x - np.median(x)))\n",
" # return mad_x\n",
"\n",
"def minimum(x):\n",
" \"\"\"Calculates minimum\"\"\"\n",
@@ -138,15 +138,15 @@
" \"\"\"Calculates maximum\"\"\"\n",
" return max(x)\n",
"\n",
"def energy_measure(x):\n",
"#def energy_measure(x):\n",
" \"\"\"Calculates energy measures\"\"\"\n",
" em_x = np.mean(np.square(x))\n",
" return em_x\n",
" # em_x = np.mean(np.square(x))\n",
" # return em_x\n",
"\n",
"def max_index(x):\n",
" \"\"\"Calculates index of maximum\"\"\"\n",
" idx_x = np.argmax(x)\n",
" return idx_x\n",
"#def max_index(x):\n",
" # \"\"\"Calculates index of maximum\"\"\"\n",
" #idx_x = np.argmax(x)\n",
" #return idx_x\n",
"\n",
"def inter_quartile_range(x):\n",
" \"\"\"Calculates inter-quartile range\"\"\"\n",
@@ -196,14 +196,15 @@
" current_subsequence = X[c][i]\n",
" mean_c = mean(current_subsequence)\n",
" slope_c = get_Orientation(current_subsequence)\n",
" #std_c = std_dev(current_subsequence)\n",
" std_c = std_dev(current_subsequence)\n",
" #mad_c = mad(current_subsequence)\n",
" #min_c = minimum(current_subsequence)\n",
" #max_c = maximum(current_subsequence)\n",
" min_c = minimum(current_subsequence)\n",
" max_c = maximum(current_subsequence)\n",
" iqr_c = inter_quartile_range(current_subsequence)\n",
" #em_c = energy_measure(current_subsequence)\n",
" ##line_c = get_lineBestFit(current_subsequence)\n",
" #stdrr_c = get_stderr(current_subsequence)\n",
" Features_array[c].append([slope_c, mean_c])\n",
" Features_array[c].append([slope_c, mean_c, std_c, min_c, max_c, iqr_c])\n",
" Features_array = np.array(Features_array)\n",
" \n",
" Features_array_normalized = [[], [],[]]\n",
@@ -299,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -330,43 +331,43 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"train_corpus, train_vocabulary = runFeaturesExtraction(10) # without scaling, window length = 30"
"train_corpus, train_vocabulary = runFeaturesExtraction(30)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(264672, 4)\n"
"(308784, 12)\n"
]
}
],
"source": [
"train_vocabulary[578] # array([ 4.60539523e-03, 1.02079573e+00, 1.29865663e-02, -4.59548657e-04]) without normalization\n",
"train_vocabulary[578] \n",
"print(train_vocabulary.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7352, 36)"
"(7352, 42)"
]
},
"execution_count": 29,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -377,7 +378,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -395,7 +396,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
......
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"plt.style.use('bmh')\n",
"%matplotlib inline\n",
"%run getFeatures_CB2.ipynb\n",
"\n",
"\n",
"\n",
"def select_rfc_params(X_train, y_train, folds):\n",
" \"\"\"Hyperparameter optimization\n",
"\n",
" Args:\n",
" X_train, y_train: corresponding dataset and labels from load_data\n",
" Returns:\n",
" grid_search.best_params_: Best Parameters for SVC\n",
" \"\"\"\n",
" estimator_array = range(50,200,10)\n",
" depth_array = range(2,15)\n",
" params = {'n_estimators': estimator_array, 'max_depth': depth_array}\n",
" # Create a based model\n",
" rf = RandomForestClassifier()\n",
" # Instantiate the grid search model\n",
" grid_search = GridSearchCV(estimator=rf, param_grid=params, cv=folds)\n",
" grid_search.fit(X_train, y_train)\n",