{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Workbook for Eye Tracking Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Basics\n", "import numpy as np\n", "import os\n", "import math\n", "\n", "# Plotting\n", "import matplotlib.pyplot as plt\n", "import ipympl\n", "\n", "# Data processing\n", "import pandas as pd\n", "import awkward as ak\n", "\n", "# ML\n", "# from sklearn import \n", "# patch_sklearn()\n", "from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold\n", "from sklearn.svm import SVC\n", "\n", "# Misc\n", "from pathlib import Path\n", "import pyarrow as pa\n", "import urllib.request\n", "from itertools import chain, combinations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import users and their score" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Imports the user score information\n", "user_data = pd.read_csv(r\"data/scores_WtG_PrePost.csv\", delimiter=\",\", usecols=[\"User\", \"Pre score\", \"Post score\", \"Difference\", \"Group cat\"])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Defines my directory with the user data\n", "user_dir = 'data/with ET'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Filters and drops non-relevant users\n", "def strip_non_relevant(user_data):\n", " to_drop = []\n", " for i, cat in enumerate(user_data[\"Group cat\"]):\n", " if math.isnan(cat):\n", " to_drop.append(i)\n", " user_data = user_data.drop(to_drop)\n", " user_data = user_data.reset_index()\n", " \n", " # Filters and drops users with no directory\n", " not_existing_names = []\n", " for i, user in enumerate(user_data[\"User\"]):\n", " if not os.path.isdir(user_dir + '/' + user):\n", " not_existing_names.append(i)\n", " user_data = user_data.drop(not_existing_names)\n", " user_data = user_data.reset_index()\n", " \n", " return user_data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "tags": [] }, "outputs": [], "source": [ "user_data = strip_non_relevant(user_data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Convert to awkward array\n", "array_user = ak.zip(dict(user_data))\n", "#array_user" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Eye Tracking data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creates dictionary with all the files for one user\n", "file_names = {}\n", "for user in user_data[\"User\"]:\n", " #print(user)\n", " available_files = []\n", " available_files_temp = os.listdir(user_dir + '/' + user)\n", " for file in available_files_temp:\n", " if \"graph01-ET_planning\" in file:\n", " available_files.append(file)\n", " # print(available_files)\n", " file_names[user] = available_files\n", "#file_names" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "16\n", "16\n" ] } ], "source": [ "# Read each CSV file for one user, stored for each attempt\n", "df_attempt1 = []\n", "df_attempt2 = []\n", "attempt2_mask = []\n", "for user in user_data['User']:\n", " files = file_names[user]\n", " if len(files) == 2:\n", " attempt2_mask.append(True)\n", " # attempt2_mask.append(False)\n", "\n", " df_attempt1.append(pd.read_csv(user_dir + '/' + user + '/' + files[0], delimiter=\"\t\", usecols=[\"eyeDataTimestamp\", \"gazePointAOI_target_x\", \"gazePointAOI_target_y\"]))\n", " df_attempt2.append(pd.read_csv(user_dir + '/' + user + '/' + files[1], delimiter=\"\t\", usecols=[\"eyeDataTimestamp\", \"gazePointAOI_target_x\", \"gazePointAOI_target_y\"]))\n", " elif len(files) == 1:\n", " attempt2_mask.append(False)\n", " df_attempt1.append(pd.read_csv(user_dir + '/' + user + '/' + files[0], delimiter=\"\t\", usecols=[\"eyeDataTimestamp\", \"gazePointAOI_target_x\", \"gazePointAOI_target_y\"]))\n", " df_attempt2.append(pd.read_csv(user_dir + '/' + user + '/' + files[0], delimiter=\"\t\", usecols=[\"eyeDataTimestamp\", \"gazePointAOI_target_x\", \"gazePointAOI_target_y\"]))\n", "print(len(df_attempt1))\n", "print(len(df_attempt2))\n", "#df_attempt1" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Add delta t list\n", "for attempt in [df_attempt1, df_attempt2]:\n", " for i in range(len(attempt)):\n", " temp_delta_t_list = []\n", " for j in range(len(attempt[i][\"eyeDataTimestamp\"]) - 1):\n", " temp_delta_t_list.append(attempt[i][\"eyeDataTimestamp\"][j+1] - attempt[i][\"eyeDataTimestamp\"][j])\n", " temp_delta_t_list.append(np.mean(temp_delta_t_list))\n", " attempt[i][\"deltaTimestamp\"] = temp_delta_t_list" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Convert df_attempts to ak.Array\n", "array_attempt1 = []\n", "array_attempt2 = []\n", "for df in df_attempt1:\n", " array_attempt1.append(ak.Array(dict(df)))\n", "for df in df_attempt2:\n", " array_attempt2.append(ak.Array(dict(df)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data processing" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### Add Eye Tracking Data to user data" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Adds a list of arrays in a new column to an array\n", "def add_column(ak_array, arrays, col_name):\n", " combined_entries = [\n", " {**{k: ak_array[k][i] for k in ak_array.fields}, col_name: array} for i, (entry, array) in enumerate(zip(ak_array, arrays))\n", " ]\n", " return ak.Array(combined_entries)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creates arrays with labels for all 3 categories\n", "def create_labels(array_user):\n", " labels_str = []\n", " labels_int_expert = []\n", " labels_int_good = []\n", " labels_int_bad = []\n", "\n", " for subject_name, pre_score, diff in zip(array_user[\"User\"], array_user[\"Pre score\"], array_user[\"Difference\"]):\n", " if pre_score == 2 and diff == 0:\n", " label_str = \"Expert\"\n", " label_int_expert = 1\n", " label_int_good = 0 \n", " label_int_bad = 0\n", " elif diff <= 0:\n", " label_str = \"Bad\"\n", " label_int_expert = 0\n", " label_int_good = 0 \n", " label_int_bad = 1\n", " else:\n", " label_str = \"Good\"\n", " label_int_expert = 0\n", " label_int_good = 1\n", " label_int_bad = 0\n", " labels_str.append(label_str)\n", " labels_int_expert.append(label_int_expert) \n", " labels_int_good.append(label_int_good) \n", " labels_int_bad.append(label_int_bad) \n", "\n", " labels_str = ak.Array(labels_str)\n", " labels_int_expert = ak.Array(labels_int_expert)\n", " labels_int_good = ak.Array(labels_int_good)\n", " labels_int_bad = ak.Array(labels_int_bad)\n", " \n", " return labels_str, labels_int_expert, labels_int_good, labels_int_bad" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "tags": [] }, "outputs": [], "source": [ "labels_str, labels_int_expert, labels_int_good, labels_int_bad = create_labels(array_user)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creates array with first and second attempts added\n", "array_data = add_column(array_user, array_attempt1, 'Attempt1')\n", "array_data = add_column(array_data, array_attempt2, 'Attempt2')\n", "array_data[\"Attempt 2 Mask\"] = ak.Array(attempt2_mask)\n", "\n", "array_data['Labels Str'] = labels_str\n", "array_data['Labels Expert'] = labels_int_expert\n", "array_data['Labels Good'] = labels_int_good\n", "array_data['Labels Bad'] = labels_int_bad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### AOIs" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "tags": [] }, "outputs": [], "source": [ "def minmax(data):\n", " \"\"\"Get the min and max of an iterable in O(n) time and constant space.\"\"\"\n", " minValue = data[0]\n", " maxValue = data[0]\n", " for d in data[1:]:\n", " minValue = d if d < minValue else minValue\n", " maxValue = d if d > maxValue else maxValue\n", " return (minValue,maxValue)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Get Range of field of view\n", "def get_minmax(array_data):\n", " min_max_x = []\n", " min_max_y = []\n", " for i, user in enumerate(array_data[\"User\"]):\n", " min_x, max_x = minmax(array_data[\"Attempt1\"][i][\"gazePointAOI_target_x\"])\n", " min_y, max_y = minmax(array_data[\"Attempt1\"][i][\"gazePointAOI_target_y\"])\n", " min_max_x.extend([min_x, max_x])\n", " min_max_y.extend([min_y, max_y])\n", "\n", " if array_data[\"Attempt 2 Mask\"][i]:\n", " min_x, max_x = minmax(array_data[\"Attempt2\"][i][\"gazePointAOI_target_x\"])\n", " min_y, max_y = minmax(array_data[\"Attempt2\"][i][\"gazePointAOI_target_y\"])\n", " min_max_x.extend([min_x, max_x])\n", " min_max_y.extend([min_y, max_y])\n", " min_x, max_x = minmax(min_max_x)\n", " min_y, max_y = minmax(min_max_y)\n", "\n", " return min_x, max_x, min_y, max_y" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "tags": [] }, "outputs": [], "source": [ "min_x, max_x, min_y, max_y = get_minmax(array_data)\n", "\n", "number_of_y_aios = 10\n", "number_of_x_aios = 15\n", "\n", "y_mesh = np.linspace(min_y, max_y, number_of_y_aios, endpoint=True)\n", "x_mesh = np.linspace(min_x, max_x, number_of_x_aios, endpoint=True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ef433922a3a546eb8237af4830c6acf1", "version_major": 2, "version_minor": 0 }, "image/png": "", "text/html": [ "\n", " <div style=\"display: inline-block;\">\n", " <div class=\"jupyter-widgets widget-label\" style=\"text-align: center;\">\n", " Figure\n", " </div>\n", " <img src='' width=640.0/>\n", " </div>\n", " " ], "text/plain": [ "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#%matplotlib widget\n", "\n", "fig, ax = plt.subplots()\n", "for i, user in enumerate(array_data[\"User\"]):\n", " ax.plot(array_data[\"Attempt1\"][i][\"gazePointAOI_target_x\"], array_data[\"Attempt1\"][i][\"gazePointAOI_target_y\"], label=user + \" A1\")\n", " if array_data[\"Attempt 2 Mask\"][i]:\n", " ax.plot(array_data[\"Attempt2\"][i][\"gazePointAOI_target_x\"], array_data[\"Attempt2\"][i][\"gazePointAOI_target_y\"], label=user + \" A2\")\n", "ax.set_yticks(y_mesh)\n", "ax.set_xticks(x_mesh)\n", "ax.yaxis.grid(True, which='major')\n", "ax.xaxis.grid(True, which='major')\n", "ax.set_ylabel(\"y position of gaze\")\n", "ax.set_xlabel(\"x position of gaze\")\n", "\n", "\n", "# plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_aoi(x_coordinate, y_coordinate):\n", " \"\"\"given the x and y, find the aoi; return two indexes corresponding to the x and y position of the aoi\"\"\"\n", " x_index = np.argmin(np.abs(x_coordinate - x_mesh))\n", " if x_coordinate - x_mesh[x_index] > 0:\n", " x_index += 1\n", " y_index = np.argmin(np.abs(y_coordinate - y_mesh))\n", " if y_coordinate - y_mesh[y_index] > 0:\n", " y_index += 1\n", " return x_index, y_index" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Window Sliding Method" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "tags": [] }, "outputs": [], "source": [ "windows = []\n", "# windows_labels_int_expert = []\n", "# windows_labels_int_good = []\n", "# windows_labels_int_bad = []\n", "windows_labels_int = {\"Expert\": [], \"Good\": [], \"Bad\":[], \"Str\": []}\n", "windows_delta_t = []\n", "\n", "for i, user in enumerate(array_data[\"User\"]):\n", " for j in range(int((len(array_data[\"Attempt1\"][i][\"gazePointAOI_target_x\"])-21)/14)):\n", " try:\n", " windows.append({\"gazePointAOI_target_x\": array_data[\"Attempt1\"][i][\"gazePointAOI_target_x\"][j*14:(j*14 + 21)], \"gazePointAOI_target_y\": array_data[\"Attempt1\"][i][\"gazePointAOI_target_y\"][j*14:(j*14 + 21)]})\n", " # windows_labels_int_expert.append(array_data[\"Labels Expert\"][i])\n", " # windows_labels_int_good.append(array_data[\"Labels Good\"][i])\n", " # windows_labels_int_bad.append(array_data[\"Labels Bad\"][i])\n", " windows_labels_int[\"Expert\"].append(array_data[\"Labels Expert\"][i])\n", " windows_labels_int[\"Good\"].append(array_data[\"Labels Good\"][i])\n", " windows_labels_int[\"Bad\"].append(array_data[\"Labels Bad\"][i])\n", " windows_delta_t.append(array_data[\"Attempt1\"][i][\"deltaTimestamp\"][j*14:(j*14 + 21)])\n", " \n", " windows_labels_int[\"Str\"].append(array_data[\"Labels Str\"][i])\n", "\n", " except IndexError:\n", " windows.append({\"gazePointAOI_target_x\": array_data[\"Attempt1\"][i][\"gazePointAOI_target_x\"][-21:], \"gazePointAOI_target_y\": array_data[\"Attempt1\"][i][\"gazePointAOI_target_y\"][-21:]})\n", " # windows_labels_int_expert.append(array_data[\"Labels Expert\"][i])\n", " # windows_labels_int_good.append(array_data[\"Labels Good\"][i])\n", " # windows_labels_int_bad.append(array_data[\"Labels Bad\"][i])\n", " windows_labels_int[\"Expert\"].append(array_data[\"Labels Expert\"][i])\n", " windows_labels_int[\"Good\"].append(array_data[\"Labels Good\"][i])\n", " windows_labels_int[\"Bad\"].append(array_data[\"Labels Bad\"][i])\n", " windows_delta_t.append(array_data[\"Attempt1\"][i][\"deltaTimestamp\"][-21:])\n", " \n", " windows_labels_int[\"Str\"].append(array_data[\"Labels Str\"][i])\n", "\n", " if array_data[\"Attempt 2 Mask\"][i]:\n", " \n", " try:\n", " windows.append({\"gazePointAOI_target_x\": array_data[\"Attempt2\"][i][\"gazePointAOI_target_x\"][j*14:(j*14 + 21)], \"gazePointAOI_target_y\": array_data[\"Attempt2\"][i][\"gazePointAOI_target_y\"][j*14:(j*14 + 21)]})\n", " # windows_labels_int_expert.append(array_data[\"Labels Expert\"][i])\n", " # windows_labels_int_good.append(array_data[\"Labels Good\"][i])\n", " # windows_labels_int_bad.append(array_data[\"Labels Bad\"][i])\n", " windows_labels_int[\"Expert\"].append(array_data[\"Labels Expert\"][i])\n", " windows_labels_int[\"Good\"].append(array_data[\"Labels Good\"][i])\n", " windows_labels_int[\"Bad\"].append(array_data[\"Labels Bad\"][i])\n", " windows_delta_t.append(array_data[\"Attempt2\"][i][\"deltaTimestamp\"][j*14:(j*14 + 21)])\n", " \n", " windows_labels_int[\"Str\"].append(array_data[\"Labels Str\"][i])\n", "\n", " except IndexError:\n", " windows.append({\"gazePointAOI_target_x\": array_data[\"Attempt2\"][i][\"gazePointAOI_target_x\"][-21:], \"gazePointAOI_target_y\": array_data[\"Attempt2\"][i][\"gazePointAOI_target_y\"][-21:]})\n", " # windows_labels_int_expert.append(array_data[\"Labels Expert\"][i])\n", " # windows_labels_int_good.append(array_data[\"Labels Good\"][i])\n", " # windows_labels_int_bad.append(array_data[\"Labels Bad\"][i])\n", " windows_labels_int[\"Expert\"].append(array_data[\"Labels Expert\"][i])\n", " windows_labels_int[\"Good\"].append(array_data[\"Labels Good\"][i])\n", " windows_labels_int[\"Bad\"].append(array_data[\"Labels Bad\"][i])\n", " windows_delta_t.append(array_data[\"Attempt2\"][i][\"deltaTimestamp\"][-21:])\n", " \n", " windows_labels_int[\"Str\"].append(array_data[\"Labels Str\"][i])\n", " \n", " continue" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "tags": [] }, "outputs": [], "source": [ "windows_dict = {\"GazePoints\": ak.Array(windows), \"Labels\": ak.Array(windows_labels_int), \"DeltaTimestamps\": ak.Array(windows_delta_t)}\n", "array_windows = ak.Array(windows_dict)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate Features" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "tags": [] }, "outputs": [], "source": [ "# add metrics to array_windows\n", "\n", "windowsAOI = []\n", "\n", "#array with coordinates of AOIs\n", "AOIs = [0] * 150\n", "i = 0\n", "j = 0\n", "for x in range(0,len(AOIs)):\n", "\n", " AOIs[x] = (int(j), int(i))\n", " i = i + 1\n", " if i == 10:\n", " i = 0\n", " j = j + 1\n", "\n", "for i, GazePoints in enumerate(array_windows[\"GazePoints\"]):\n", " #TotalAOIs = np.append(TotalAOIs, AOIs)\n", " GazePointsAOI = [0] * 150\n", " GazePointsAOIinstances = [0] * 150 \n", " DwellTime = [0] * 150\n", " averageTime = [0] * 150 #average duration on aoi\n", " averagePoints = [0] * 150 #average number of successive points on aoi\n", " standardDevTime = [0] * 150\n", " standardDevPoints = [0] * 150\n", " for j, xGaze in enumerate(array_windows[\"GazePoints\"][i][\"gazePointAOI_target_x\"]):\n", " aoi = get_aoi(array_windows[\"GazePoints\"][i][\"gazePointAOI_target_x\"][j], array_windows[\"GazePoints\"][i][\"gazePointAOI_target_y\"][j])\n", " x = AOIs.index(aoi)\n", " #calculating mean \n", " if j == 0:\n", " time = array_windows[\"DeltaTimestamps\"][i][j]\n", " points = 1\n", " k = x\n", " elif j == len(array_windows[\"GazePoints\"][i][\"gazePointAOI_target_x\"])-1:\n", " if aoi == last_aoi:\n", " time = time + array_windows[\"DeltaTimestamps\"][i][j]\n", " points = points + 1\n", " averageTime[x] = (GazePointsAOIinstances[x]*averageTime[x]+time)/(GazePointsAOIinstances[x]+1)\n", " averagePoints[x] = (GazePointsAOIinstances[x]*averagePoints[x]+points)/(GazePointsAOIinstances[x]+1)\n", " GazePointsAOIinstances[x] = GazePointsAOIinstances[x] + 1\n", " else:\n", " \n", " averageTime[k] = (GazePointsAOIinstances[k]*averageTime[k]+time)/(GazePointsAOIinstances[k]+1)\n", " averagePoints[k] = (GazePointsAOIinstances[k]*averagePoints[k]+points)/(GazePointsAOIinstances[k]+1)\n", " GazePointsAOIinstances[k] = GazePointsAOIinstances[k] + 1\n", " time = array_windows[\"DeltaTimestamps\"][i][j]\n", " points = 1\n", " averageTime[x] = (GazePointsAOIinstances[x]*averageTime[x]+time)/(GazePointsAOIinstances[x]+1)\n", " averagePoints[x] = (GazePointsAOIinstances[x]*averagePoints[x]+points)/(GazePointsAOIinstances[x]+1)\n", " GazePointsAOIinstances[x] = GazePointsAOIinstances[x] + 1 \n", " \n", " else: #if index not first and not last: check, if aoi is same as last: if true add together \n", " if aoi == last_aoi:\n", " time = time + array_windows[\"DeltaTimestamps\"][i][j]\n", " points = points + 1\n", " k = x\n", " else: #calculate an incremental mean, by multiplying the previous mean with number of instances and adding new value\n", " \n", " averageTime[k] = (GazePointsAOIinstances[k]*averageTime[k]+time)/(GazePointsAOIinstances[k]+1)\n", " averagePoints[k] = (GazePointsAOIinstances[k]*averagePoints[k]+points)/(GazePointsAOIinstances[k]+1)\n", " GazePointsAOIinstances[k] = GazePointsAOIinstances[k] + 1\n", " time = array_windows[\"DeltaTimestamps\"][i][j]\n", " points = 1\n", " k = x\n", " \n", " \n", " \n", " DwellTime[x] = DwellTime[x] + array_windows[\"DeltaTimestamps\"][i][j]\n", " GazePointsAOI[x] = GazePointsAOI[x] + 1\n", " \n", " last_aoi = aoi\n", "\n", " \n", "#calculate standard deviations \n", " for s, xGaze in enumerate(array_windows[\"GazePoints\"][i][\"gazePointAOI_target_x\"]):\n", " aoi = get_aoi(array_windows[\"GazePoints\"][i][\"gazePointAOI_target_x\"][s], array_windows[\"GazePoints\"][i][\"gazePointAOI_target_y\"][s])\n", " x = AOIs.index(aoi)\n", " if s == 0:\n", " time = array_windows[\"DeltaTimestamps\"][i][s]\n", " points = 1\n", " k = x\n", " elif s == len(array_windows[\"GazePoints\"][i][\"gazePointAOI_target_x\"])-1:\n", " if aoi == latest_aoi:\n", " time = time + array_windows[\"DeltaTimestamps\"][i][s]\n", " points = points + 1\n", " standardDevTime[x] = standardDevTime[x] + (averageTime[x] - time)**2 \n", " standardDevPoints[x] = standardDevPoints[x] + (averagePoints[x] - points)**2 \n", " else:\n", " \n", " standardDevTime[k] = standardDevTime[k] + (averageTime[k] - time)**2 \n", " standardDevPoints[k] = standardDevPoints[k] + (averagePoints[k] - points)**2\n", " time = array_windows[\"DeltaTimestamps\"][i][s]\n", " points = 1\n", " standardDevTime[x] = standardDevTime[x] + (averageTime[x] - time)**2 \n", " standardDevPoints[x] = standardDevPoints[x] + (averagePoints[x] - points)**2\n", " \n", " \n", " else: #if index not first and not last: check, if aoi is same as last: if true add together \n", " if aoi == latest_aoi:\n", " time = time + array_windows[\"DeltaTimestamps\"][i][s]\n", " points = points + 1\n", " k = x\n", " else: #calculate an incremental mean, by multiplying the previous mean with number of instances and adding new value\n", " \n", " standardDevTime[k] = standardDevTime[k] + (averageTime[k] - time)**2 \n", " standardDevPoints[k] = standardDevPoints[k] + (averagePoints[k] - points)**2\n", " time = array_windows[\"DeltaTimestamps\"][i][s]\n", " points = 1\n", " k = x\n", " \n", " latest_aoi = aoi\n", " \n", " for l in range(0,len(standardDevTime)):\n", " if GazePointsAOIinstances[l] != 0:\n", " standardDevTime[l] = math.sqrt(standardDevTime[l])/GazePointsAOIinstances[l]\n", " standardDevPoints[l] = math.sqrt(standardDevPoints[l])/GazePointsAOIinstances[l]\n", " \n", " \n", "\n", " \n", " \n", " AOI_dict = {\"AOI\": AOIs, \"TotalDwellTime\": DwellTime, \"GazePointsAOI\": GazePointsAOI,\n", " \"Instances\": GazePointsAOIinstances, \"AvTime\": averageTime, \"AvPoints\": averagePoints,\n", " \"StDtime\": standardDevTime, \"StDpoints\": standardDevPoints}\n", " #AOIS = ak.Array(AOI_dict)\n", " windowsAOI.append(AOI_dict)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "tags": [] }, "outputs": [], "source": [ "#windows_dict_feat = {\"GazePoints\": ak.Array(windows), \"Labels\": ak.Array(windows_labels_int), \"DeltaTimestamps\": ak.Array(windows_delta_t), \"features\": ak.Array(windowsAOI)}\n", "#array_windows_feat = ak.Array(windows_dict_feat)\n", "array_features = ak.from_iter(windowsAOI)\n", "array_windows_feat = add_column(array_windows, array_features, 'Features')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "tags": [] }, "outputs": [], "source": [ "array_windows_data = array_windows_feat" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Support Vector Machine" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "tags": [] }, "outputs": [], "source": [ "fields_wo_aoi = []\n", "for field in array_windows_data['Features'].fields:\n", " if field != 'AOI':\n", " fields_wo_aoi.append(field)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['TotalDwellTime',\n", " 'GazePointsAOI',\n", " 'Instances',\n", " 'AvTime',\n", " 'AvPoints',\n", " 'StDtime',\n", " 'StDpoints']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fields_wo_aoi" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "tags": [] }, "outputs": [], "source": [ "x_data = array_windows_data['Features'][fields_wo_aoi]\n", "X_data = []\n", "for array in x_data:\n", " temp = []\n", " for field in fields_wo_aoi:\n", " temp.extend(array[field].to_numpy())\n", " X_data.append(temp)\n", "X_data = np.array(X_data)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "tags": [] }, "outputs": [], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "\n", "scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_data)\n", "X_data = scaling.transform(X_data)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "tags": [] }, "outputs": [], "source": [ "y_bad = array_windows_data[\"Labels\"][\"Bad\"]\n", "y_good = array_windows_data[\"Labels\"][\"Good\"]\n", "y_expert = array_windows_data[\"Labels\"][\"Expert\"]\n", "y_dict = {\"Bad\": y_bad, \"Good\": y_good, \"Expert\": y_expert}\n", "\n", "y_good = (y_good - 1)* (-1)\n", "\n", "\n", "clf_bad = SVC(probability=True)\n", "clf_good = SVC(probability=True)\n", "clf_expert = SVC(probability=True)\n", "svm_dict = {\"Bad\": clf_bad, \"Good\": clf_good, \"Expert\": clf_expert}\n", "\n", "'''\n", "clf_bad.fit(X_data, y_bad)\n", "clf_good .fit(X_data, y_good)\n", "clf_expert.fit(X_data, y_expert)\n", "'''\n", "#y_bad = y_good\n", "y_bad = y_expert" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "tags": [] }, "outputs": [], "source": [ "import pickle\n", "\n", "save_dict = y_dict\n", "save_dict[\"Data\"] = X_data\n", "\n", "with open('data/data_and_labels_dict.pkl', 'wb') as f:\n", " pickle.dump(save_dict, f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Accuracies" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Get the accuracy of a classifier\n", "def get_accuracy(clf, X, labels):\n", " return np.sum([pred == label for pred, label in zip(clf.predict(X), labels)])/len(labels)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Get the accuracy of a classifier with >p probability\n", "def get_accuracy_prob(clf, X, labels, p=0.5):\n", " return np.sum([pred1 >= p for (pred0, pred1), label in zip(clf.predict_proba(X), labels)])/len(labels)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'\\nacc_dict = {}\\nfor name, svmachine in svm_dict.items():\\n acc_dict[name] = get_accuracy(svmachine, X_data, y_dict[name])\\nacc_dict\\n'" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "acc_dict = {}\n", "for name, svmachine in svm_dict.items():\n", " acc_dict[name] = get_accuracy(svmachine, X_data, y_dict[name])\n", "acc_dict\n", "'''" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'\\nacc_dict_prob = {}\\nfor name, svmachine in svm_dict.items():\\n acc_dict_prob[name] = get_accuracy_prob(svmachine, X_data, y_dict[name], p=0.5)\\nacc_dict_prob\\n'" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "acc_dict_prob = {}\n", "for name, svmachine in svm_dict.items():\n", " acc_dict_prob[name] = get_accuracy_prob(svmachine, X_data, y_dict[name], p=0.5)\n", "acc_dict_prob\n", "'''" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "tags": [] }, "outputs": [], "source": [ "#np.sum([clf_good.predict_proba(X_data)[:, 1]>=0.5])/len(clf_good.predict_proba(X_data)[:, 0])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'\\npredictions = []\\n\\nfor name, svmachine in svm_dict.items():\\n predictions.append(svmachine.predict_proba(X_data))\\npredictions\\n'" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "predictions = []\n", "\n", "for name, svmachine in svm_dict.items():\n", " predictions.append(svmachine.predict_proba(X_data))\n", "predictions\n", "'''" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'\\npredictions_str = []\\npossible_outcome = [\"Bad\", \"Good\", \"Expert\"]\\nfor a, b, c in zip(predictions[0][:, 1], predictions[1][:, 1], predictions[2][:, 1]):\\n list = [a, b, c]\\n index = np.argmax(list)\\n predictions_str.append(possible_outcome[index])\\n'" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "predictions_str = []\n", "possible_outcome = [\"Bad\", \"Good\", \"Expert\"]\n", "for a, b, c in zip(predictions[0][:, 1], predictions[1][:, 1], predictions[2][:, 1]):\n", " list = [a, b, c]\n", " index = np.argmax(list)\n", " predictions_str.append(possible_outcome[index])\n", "'''" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'\\nis_correct_list = [pred == label for pred, label in zip(predictions_str, array_windows_data[\"Labels\"][\"Str\"])]\\nacc_total = np.sum(is_correct_list)/len(is_correct_list)\\nacc_total\\n'" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "is_correct_list = [pred == label for pred, label in zip(predictions_str, array_windows_data[\"Labels\"][\"Str\"])]\n", "acc_total = np.sum(is_correct_list)/len(is_correct_list)\n", "acc_total\n", "'''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Recall and Matthews Correlation Coefficient" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Calculates the recall from given clf and data\n", "def get_recall(clf, X, labels):\n", " return np.sum([pred == label and label == 1 for pred, label in zip(clf.predict(X), labels)])/np.sum(labels)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Calculates the Matthews Correlation Coefficient from given clf and data\n", "def get_MCC(clf, X, labels):\n", " tp = np.sum([pred == label and label == 1 for pred, label in zip(clf.predict(X), labels)])\n", " tn = np.sum([pred == label and label == 0 for pred, label in zip(clf.predict(X), labels)])\n", " fp = np.sum([pred != label and label == 0 for pred, label in zip(clf.predict(X), labels)])\n", " fn = np.sum([pred != label and label == 1 for pred, label in zip(clf.predict(X), labels)])\n", " return (tp*tn-fp*fn)/np.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### F1 Score" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Calculates the F1 Score from given clf and data\n", "def get_F1Score(clf, X, labels):\n", " tp = np.sum([pred == label and label == 1 for pred, label in zip(clf.predict(X), labels)])\n", " fp = np.sum([pred != label and label == 0 for pred, label in zip(clf.predict(X), labels)])\n", " fn = np.sum([pred != label and label == 1 for pred, label in zip(clf.predict(X), labels)])\n", " #print([tp, fp, fn])\n", " return 2*tp/(2*tp+fp+fn)" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## ML Pipeline Test" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Gets all subsets of a list\n", "def get_subsets(fullset):\n", " listrep = fullset\n", " n = len(listrep)\n", " return [[listrep[k] for k in range(n) if i & 1 << k] for i in range(2 ** n)][1:]\n", "\n", "\n", "string = [\"x\", \"y\", \"z\"]\n", "# print(get_subsets(string))" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "tags": [] }, "outputs": [], "source": [ "feature_dict = {'Features': get_subsets(fields_wo_aoi)}\n", "# print(len(feature_dict['Features']))\n", "#feature_dict['Features']" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "tags": [] }, "outputs": [], "source": [ "inner_StratKFold = StratifiedKFold(n_splits=5)\n", "X_bad_inner_train = inner_StratKFold.split(X_data, y_bad)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "tags": [] }, "outputs": [], "source": [ "# for data in X_bad_inner_train:\n", "# print(len(data[0]), len(data[1]))\n", "# print(f\"Training:\\nLabels: {len(y_bad[data[0]])}\\nX: {X_data[data[0]].shape}\")\n", "# print(f\"Test:\\nLabels: {len(y_bad[data[1]])}\\nX: {X_data[data[1]].shape}\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "tags": [] }, "outputs": [], "source": [ "svm_model = SVC()\n", "#GS = GridSearchCV(svm_model, param_grid)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Nested Cross Validation" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Define the grid of hyperparameters to search\n", "param_grid = {\n", " 'C': [0.1, 1, 10, 100], # Regularization parameter\n", " 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], # Type of kernel function\n", " 'gamma': ['auto', 'scale'] # Kernel coefficient\n", "}" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1521, 417) 1521\n" ] } ], "source": [ "#remove all aois that are zero for all windows\n", "idx = np.argwhere(np.all(X_data[..., :] == -1, axis=0))\n", "reduced_X_data = np.delete(X_data, idx, axis=1)\n", "print(reduced_X_data.shape, len(y_bad))" ] }, { "cell_type": "markdown", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "### Without StratifiedKFold" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'\\n#shorten data set for rnCV\\n#labels have been sorted, to get classes one and zero take 30 from start and 30 from end\\nreduced_X_data1 = reduced_X_data[:30,:]\\nreduced_X_data2 = reduced_X_data[-30:,:]\\nreduced_X_data = np.concatenate((reduced_X_data1,reduced_X_data2),axis=0)\\ny_bad1 = y_bad[:30]\\ny_bad2 = y_bad[-30:]\\ny_bad = np.concatenate((y_bad1,y_bad2),axis=0)\\nprint(np.shape(reduced_X_data), np.shape(y_bad))\\n'" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", "#shorten data set for rnCV\n", "#labels have been sorted, to get classes one and zero take 30 from start and 30 from end\n", "reduced_X_data1 = reduced_X_data[:30,:]\n", "reduced_X_data2 = reduced_X_data[-30:,:]\n", "reduced_X_data = np.concatenate((reduced_X_data1,reduced_X_data2),axis=0)\n", "y_bad1 = y_bad[:30]\n", "y_bad2 = y_bad[-30:]\n", "y_bad = np.concatenate((y_bad1,y_bad2),axis=0)\n", "print(np.shape(reduced_X_data), np.shape(y_bad))\n", "'''" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\n X_data_test = reduced_X_data[(i+1)*int((total/10)):,:] \\n X_data2 = reduced_X_data[:i*int((total/10)),:]\\n #print(np.shape(X_data1), np.shape(X_data2))\\n X_split = np.concatenate((X_data1,X_data2),axis=0) \\n y_bad1 = y_bad[(i+1)*int((total/10)):] \\n y_bad2 = y_bad[:i*int((total/10))]\\n y_bad_split = np.concatenate((y_bad1,y_bad2),axis=0)\\n\\n\\n X_test_data = reduced_X_data[i*int((total/10)):(i+1)*int((total/10)),:]\\n y_bad_test_data = y_bad[i*int((total/10)):(i+1)*int((total/10))]\\n #print(np.shape(X_split), np.shape(X_test_data))\\n\\n\\n X_train, X_test, y_train_bad, y_test_bad = model_selection.train_test_split(X_split, y_bad_split, test_size=0.3, random_state=seed)\\n #print(np.shape(X_train), np.shape(y_train_bad))\\n \\n '" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'''\n", " X_data_test = reduced_X_data[(i+1)*int((total/10)):,:] \n", " X_data2 = reduced_X_data[:i*int((total/10)),:]\n", " #print(np.shape(X_data1), np.shape(X_data2))\n", " X_split = np.concatenate((X_data1,X_data2),axis=0) \n", " y_bad1 = y_bad[(i+1)*int((total/10)):] \n", " y_bad2 = y_bad[:i*int((total/10))]\n", " y_bad_split = np.concatenate((y_bad1,y_bad2),axis=0)\n", "\n", "\n", " X_test_data = reduced_X_data[i*int((total/10)):(i+1)*int((total/10)),:]\n", " y_bad_test_data = y_bad[i*int((total/10)):(i+1)*int((total/10))]\n", " #print(np.shape(X_split), np.shape(X_test_data))\n", "\n", "\n", " X_train, X_test, y_train_bad, y_test_bad = model_selection.train_test_split(X_split, y_bad_split, test_size=0.3, random_state=seed)\n", " #print(np.shape(X_train), np.shape(y_train_bad))\n", " \n", " '''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With StratifiedKFold" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "tags": [] }, "outputs": [], "source": [ "def get_x_data_made_of_specific_feats(desired_feats:list):\n", " x_data = array_windows_data['Features'][desired_feats]\n", " X_data = []\n", " for array in x_data:\n", " temp = []\n", " for feature in desired_feats:\n", " temp.extend(array[feature].to_numpy())\n", " X_data.append(temp)\n", " X_data = np.array(X_data)\n", "\n", " scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_data)\n", " X_data = scaling.transform(X_data)\n", " \n", " idx = np.argwhere(np.all(X_data[..., :] == -1, axis=0))\n", " X_data = np.delete(X_data, idx, axis=1)\n", " return X_data\n", "\n", "# Function to get accuracy\n", "def get_accuracy(model, X_test, y_test):\n", " return model.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5/5 [03:55<00:00, 47.17s/it]\n" ] } ], "source": [ "from sklearn.model_selection import StratifiedKFold, GridSearchCV\n", "from sklearn.svm import SVC\n", "import numpy as np\n", "from tqdm import tqdm\n", "\n", "# Define parameters\n", "#param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}\n", "seeds = [1, 3, 7, 9, 42]\n", "output_list = [] \n", "# Function to get accuracy\n", "def get_accuracy(model, X_test, y_test):\n", " return model.score(X_test, y_test)\n", "\n", "reduced_X_data = get_x_data_made_of_specific_feats(fields_wo_aoi)\n", "\n", "# Initialize results storage\n", "seedav = []\n", "\n", "# Outer loop: over different seeds\n", "output_list.append(f\"Performing repeated nested cross validation with a data set of length {len(reduced_X_data)}\")\n", "for seed in tqdm(seeds, position=0, leave=True):\n", " # Initialize outer cross-validation\n", " outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", " accav = []\n", "\n", " # Split data into training and test sets\n", " counter_split = 0\n", " for train_index, test_index in outer_cv.split(reduced_X_data, y_bad):\n", " X_train, X_test = reduced_X_data[train_index], reduced_X_data[test_index]\n", " y_train_bad, y_test_bad = y_bad[train_index], y_bad[test_index]\n", "\n", " # Optionally reduce dataset size for training and testing\n", " # Set cutoff to -1 to use the full dataset, or set to a positive number to limit the size\n", " cutoff = -1\n", " if cutoff > 0:\n", " idx_list_train = np.random.permutation(len(y_train_bad))[:cutoff]\n", " cut_X_train, cut_y_train_bad = X_train[idx_list_train], y_train_bad[idx_list_train]\n", "\n", " idx_list_test = np.random.permutation(len(y_test_bad))[:cutoff]\n", " cut_X_test, cut_y_test_bad = X_test[idx_list_test], y_test_bad[idx_list_test]\n", " else:\n", " cut_X_train, cut_y_train_bad = X_train, y_train_bad\n", " cut_X_test, cut_y_test_bad = X_test, y_test_bad\n", "\n", " # Define the SVM classifier and GridSearch\n", " SVM = SVC()\n", " inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)\n", " grid_search = GridSearchCV(estimator=SVM, param_grid=param_grid, cv=inner_cv, scoring='accuracy', n_jobs=-1, verbose=0)\n", "\n", " # Perform grid search on reduced training set\n", " grid_search.fit(cut_X_train, cut_y_train_bad)\n", "\n", " # Best model from grid search\n", " best_model = grid_search.best_estimator_\n", " test_accuracy = best_model.score(cut_X_test, cut_y_test_bad)\n", " output_list.append(f\"Seed {seed} - Split Variant {counter_split} - Test m-Accuracy: {test_accuracy}\")\n", "\n", " # Train best model on full training set\n", " best_params = grid_search.best_params_\n", " best_clf = SVC(C=best_params[\"C\"], kernel=best_params[\"kernel\"], gamma=best_params[\"gamma\"])\n", " best_clf.fit(X_train, y_train_bad)\n", "\n", " # Evaluate on full test set\n", " acc = get_accuracy(best_clf, X_test, y_test_bad)\n", " output_list.append(f\"Seed {seed} - Split Variant {counter_split} - n-Accuracy: {acc}\")\n", " accav.append(acc)\n", " counter_split += 1\n", "\n", " # Average accuracy for the current seed\n", " meanacc = np.mean(accav)\n", " seedav.append(meanacc)\n", "\n", "# Final average accuracy across all seeds\n", "rnCV = np.mean(seedav)\n", "output_list.append(f\"Repeated Nested Cross-Validation Accuracy: {rnCV}\")\n", "\n" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "tags": [] }, "outputs": [], "source": [ "np.savetxt(\"Output RNCV.txt\", output_list, fmt=\"%s\") " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Selection" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "F1 score for field TotalDwellTime: 0.0\n", "F1 score for field GazePointsAOI: 0.0\n", "F1 score for field Instances: 0.10256410256410256\n", "F1 score for field AvTime: 0.0\n", "F1 score for field AvPoints: 0.0\n", "F1 score for field StDtime: 0.0\n", "F1 score for field StDpoints: 0.0\n", "Selected feature: Instances with index 2\n", "F1 score for field TotalDwellTime: 0.22727272727272727\n", "F1 score for field GazePointsAOI: 0.22727272727272727\n", "F1 score for field AvTime: 0.2727272727272727\n", "F1 score for field AvPoints: 0.2727272727272727\n", "F1 score for field StDtime: 0.10526315789473684\n", "F1 score for field StDpoints: 0.05405405405405406\n", "Selected feature: AvTime with index 2\n", "F1 score for field TotalDwellTime: 0.2608695652173913\n", "F1 score for field GazePointsAOI: 0.2608695652173913\n", "F1 score for field AvPoints: 0.2608695652173913\n", "F1 score for field StDtime: 0.27906976744186046\n", "F1 score for field StDpoints: 0.23809523809523808\n", "Selected feature: StDtime with index 3\n", "F1 score for field TotalDwellTime: 0.3111111111111111\n", "F1 score for field GazePointsAOI: 0.3111111111111111\n", "F1 score for field AvPoints: 0.2608695652173913\n", "F1 score for field StDpoints: 0.24390243902439024\n", "Selected feature: TotalDwellTime with index 0\n", "F1 score for field GazePointsAOI: 0.2978723404255319\n", "F1 score for field AvPoints: 0.2978723404255319\n", "F1 score for field StDpoints: 0.2727272727272727\n", "Selected feature: GazePointsAOI with index 0\n", "F1 score for field AvPoints: 0.30434782608695654\n", "F1 score for field StDpoints: 0.2727272727272727\n", "Selected feature: AvPoints with index 0\n", "F1 score for field StDpoints: 0.26666666666666666\n", "Selected feature: StDpoints with index 0\n" ] } ], "source": [ "#create feature hirarchy by adding one feature and then\n", "#selecting the one which gives the highest F1 score\n", "\n", "remaining_feats = fields_wo_aoi\n", "picked_feats = []\n", "\n", "while len(remaining_feats) != 0: #while still elements in remaining_feats\n", " F1s = []\n", " for field in remaining_feats:\n", " current_feats = np.append(picked_feats, field)\n", " x_data = array_windows_data['Features'][current_feats]\n", " X_data = []\n", " for array in x_data:\n", " temp = []\n", " for feature in current_feats:\n", " temp.extend(array[feature].to_numpy())\n", " X_data.append(temp)\n", " X_data = np.array(X_data)\n", " \n", " scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_data)\n", " X_data = scaling.transform(X_data)\n", " \n", " idx = np.argwhere(np.all(X_data[..., :] == -1, axis=0))\n", " X_data = np.delete(X_data, idx, axis=1)\n", "\n", " X_train, X_test, y_train_bad, y_test_bad = train_test_split(X_data, y_bad, test_size=0.2, random_state=42)\n", " \n", "\n", " # Define the SVM classifier\n", " best_clf = SVC(C = best_params[\"C\"], kernel = best_params[\"kernel\"], gamma = best_params[\"gamma\"])\n", " best_clf.fit(X_train, y_train_bad)\n", "\n", " F1 = get_F1Score(best_clf, X_test, y_test_bad)\n", " print(f\"F1 score for field {field}: {F1}\")\n", " F1s = np.append(F1s, F1)\n", " idx = np.argmax(F1s) #get index of feature with highest F1 score\n", " print(f\"Selected feature: {remaining_feats[idx]} with index {idx}\")\n", " picked_feats = np.append(picked_feats, remaining_feats[idx])\n", " remaining_feats = np.delete(remaining_feats,idx)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "array(['Instances', 'AvTime', 'StDtime', 'TotalDwellTime',\n", " 'GazePointsAOI', 'AvPoints', 'StDpoints'], dtype='<U32')" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "picked_feats" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "tags": [] }, "outputs": [], "source": [ "picked_feats_record = ['Instances', 'AvTime', 'StDtime', 'TotalDwellTime', 'GazePointsAOI', 'AvPoints', 'StDpoints']\n", "f1_score_feature_selection = [0.10256410256410256, 0.2727272727272727, 0.27906976744186046, 0.3111111111111111, 0.2978723404255319, 0.30434782608695654, 0.26666666666666666]" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "347880ccda464005ba81d8a78877f754", "version_major": 2, "version_minor": 0 }, "image/png": "", "text/html": [ "\n", " <div style=\"display: inline-block;\">\n", " <div class=\"jupyter-widgets widget-label\" style=\"text-align: center;\">\n", " Figure\n", " </div>\n", " <img src='' width=640.0/>\n", " </div>\n", " " ], "text/plain": [ "Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "plt.figure()\n", "plt.plot(range(len(picked_feats_record)), f1_score_feature_selection)\n", "plt.xticks(range(len(picked_feats_record)), picked_feats_record, rotation='vertical')\n", "plt.xlabel(\"Features to be added from left to right\")\n", "plt.ylabel(\"F1 Score\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Permutation Test" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Permutate labeled data\n", "def permutate(labels):\n", " '''\n", " labels: labels in a {0, 1} structure\n", " return: labels shuffled\n", " '''\n", " shuffled_labels = np.copy(labels)\n", " np.random.shuffle(shuffled_labels)\n", " return shuffled_labels" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creating SVM based on Feature selection optimum\n", "def get_best_clf_from_specific_feats(feats_of_interest):\n", " reduced_X_data = get_x_data_made_of_specific_feats(feats_of_interest)\n", " print(f\"Searching for best clf in {param_grid} with {len(reduced_X_data)} datapoints with following features: {feats_of_interest}\")\n", "\n", " seed = 7\n", "\n", " outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", " accav = []\n", "\n", " # Split data into training and test sets\n", " counter_split = 0\n", " for train_index, test_index in outer_cv.split(reduced_X_data, y_bad):\n", " X_train, X_test = reduced_X_data[train_index], reduced_X_data[test_index]\n", " y_train_bad, y_test_bad = y_bad[train_index], y_bad[test_index]\n", "\n", " # Optionally reduce dataset size for training and testing\n", " # Set cutoff to -1 to use the full dataset, or set to a positive number to limit the size\n", " cutoff = -1\n", " if cutoff > 0:\n", " idx_list_train = np.random.permutation(len(y_train_bad))[:cutoff]\n", " cut_X_train, cut_y_train_bad = X_train[idx_list_train], y_train_bad[idx_list_train]\n", "\n", " idx_list_test = np.random.permutation(len(y_test_bad))[:cutoff]\n", " cut_X_test, cut_y_test_bad = X_test[idx_list_test], y_test_bad[idx_list_test]\n", " else:\n", " cut_X_train, cut_y_train_bad = X_train, y_train_bad\n", " cut_X_test, cut_y_test_bad = X_test, y_test_bad\n", "\n", " # Define the SVM classifier and GridSearch\n", " SVM = SVC()\n", " inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)\n", " grid_search = GridSearchCV(estimator=SVM, param_grid=param_grid, cv=inner_cv, scoring='accuracy', n_jobs=-1, verbose=1)\n", "\n", " # Perform grid search on reduced training set\n", " print(\"Performing Grid Search...\")\n", " grid_search.fit(cut_X_train, cut_y_train_bad)\n", "\n", " # Best model from grid search\n", " best_model = grid_search.best_estimator_\n", " test_accuracy = best_model.score(cut_X_test, cut_y_test_bad)\n", " print(f\"Seed {seed} - Split Variant {counter_split} - Test m-Accuracy: {test_accuracy}\")\n", "\n", " # Train best model on full training set\n", " best_params = grid_search.best_params_\n", " best_clf = SVC(C=best_params[\"C\"], kernel=best_params[\"kernel\"], gamma=best_params[\"gamma\"])\n", " print(f\"Performing fit of best model with params: C - {best_params['C']}; Kernel - {best_params['kernel']}; Gamma - {best_params['gamma']} on full data set...\")\n", " best_clf.fit(X_train, y_train_bad)\n", "\n", " # Evaluate on full test set\n", " acc = get_accuracy(best_clf, X_test, y_test_bad)\n", " print(f\"Seed {seed} - Split Variant {counter_split} - n-Accuracy: {acc}\")\n", " accav.append(acc)\n", " counter_split += 1\n", " return best_clf" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Calculates the probability of null hypthesis (estimator not better than random prediction)\n", "def calc_prob_null_hypothesis(n_permutations, n_score):\n", " '''\n", " n_permutations: number of permutations\n", " n_score: number of times with better prediction\n", " return: probability of null hypothesis €[0, 1]\n", " '''\n", " return (1+n_score)/(1+n_permutations)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Searching for best clf in {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['auto', 'scale']} with 1521 datapoints with following features: ['Instances', 'AvTime', 'StDtime', 'TotalDwellTime']\n", "Performing Grid Search...\n", "Fitting 5 folds for each of 32 candidates, totalling 160 fits\n", "Seed 7 - Split Variant 0 - Test m-Accuracy: 0.9019607843137255\n", "Performing fit of best model with params: C - 0.1; Kernel - linear; Gamma - auto on full data set...\n", "Seed 7 - Split Variant 0 - n-Accuracy: 0.9019607843137255\n" ] } ], "source": [ "feats_of_interest = picked_feats_record[:4]\n", "best_clf = get_best_clf_from_specific_feats(feats_of_interest)\n", "reduced_X_data = get_x_data_made_of_specific_feats(feats_of_interest)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 999/999 [01:28<00:00, 11.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Probability of null hypothesis: 0.001\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Number of permutations\n", "n_permutations = 999\n", "# Score on original data\n", "data_score = get_F1Score(best_clf, reduced_X_data, y_bad)\n", "# Get scores for permutated labels\n", "scores = []\n", "y_bad_np = y_bad.to_numpy()\n", "for permutation in tqdm(range(n_permutations)):\n", " #print(f\"Calculating permutation {permutation}/{n_permutations}\", end=\"\\r\")#, flush=True)\n", " permutated_y_labels = permutate(y_bad_np)\n", " scores.append(get_F1Score(best_clf, reduced_X_data, permutated_y_labels))\n", "# Calculate number of times where permutated labels performed better\n", "n_score = np.sum([score >= data_score for score in scores])\n", "# Calculate probability for null hypothesis\n", "p_hypothesis = calc_prob_null_hypothesis(n_permutations, n_score)\n", "print(f\"Probability of null hypothesis: {p_hypothesis}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 4 }