diff --git a/examples/verbose_example.ipynb b/examples/verbose_example.ipynb index 643739f7..4985eba6 100644 --- a/examples/verbose_example.ipynb +++ b/examples/verbose_example.ipynb @@ -13,7 +13,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -69,7 +68,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -79,7 +77,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -103,7 +100,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -111,7 +107,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -121,7 +116,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -225,7 +219,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -233,7 +226,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -246,7 +238,16 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/snil/Documents/tsflex/tsflex/utils/logging.py:81: RuntimeWarning: Logging file (example_processing_logs.log) already exists. 
This file will be overwritten!\n", + " warnings.warn(\n" + ] + } + ], "source": [ "_ = series_pipe.process([df_gsr, df_acc, df_tmp], logging_file_path=\"example_processing_logs.log\")" ] @@ -288,48 +289,48 @@ " \n", " \n", " 0\n", - " 2023-05-05 12:16:53.596\n", + " 2023-08-16 12:45:29.484\n", " clip_quantiles\n", " (TMP,)\n", " TMP\n", - " 0 days 00:00:00.002578633\n", - " 4.73\n", + " 0 days 00:00:00.002462031\n", + " 7.67\n", " \n", " \n", " 1\n", - " 2023-05-05 12:16:53.597\n", + " 2023-08-16 12:45:29.486\n", " savgol_filter\n", " (EDA,)\n", " EDA\n", - " 0 days 00:00:00.000956786\n", - " 1.76\n", + " 0 days 00:00:00.001066185\n", + " 3.32\n", " \n", " \n", " 2\n", - " 2023-05-05 12:16:53.628\n", + " 2023-08-16 12:45:29.505\n", " savgol_filter\n", " (ACC_x,), (ACC_y,), (ACC_z,)\n", " ACC_x, ACC_y, ACC_z\n", - " 0 days 00:00:00.030402122\n", - " 55.82\n", + " 0 days 00:00:00.019186704\n", + " 59.79\n", " \n", " \n", " 3\n", - " 2023-05-05 12:16:53.635\n", + " 2023-08-16 12:45:29.510\n", " smv\n", " (ACC_x, ACC_y, ACC_z)\n", " ACC_SMV\n", - " 0 days 00:00:00.006881422\n", - " 12.63\n", + " 0 days 00:00:00.004629862\n", + " 14.43\n", " \n", " \n", " 4\n", - " 2023-05-05 12:16:53.649\n", + " 2023-08-16 12:45:29.514\n", " clip_quantiles\n", " (ACC_SMV,)\n", " ACC_SMV\n", - " 0 days 00:00:00.013646172\n", - " 25.05\n", + " 0 days 00:00:00.004746801\n", + " 14.79\n", " \n", " \n", "\n", @@ -337,18 +338,18 @@ ], "text/plain": [ " log_time function series_names \\\n", - "0 2023-05-05 12:16:53.596 clip_quantiles (TMP,) \n", - "1 2023-05-05 12:16:53.597 savgol_filter (EDA,) \n", - "2 2023-05-05 12:16:53.628 savgol_filter (ACC_x,), (ACC_y,), (ACC_z,) \n", - "3 2023-05-05 12:16:53.635 smv (ACC_x, ACC_y, ACC_z) \n", - "4 2023-05-05 12:16:53.649 clip_quantiles (ACC_SMV,) \n", + "0 2023-08-16 12:45:29.484 clip_quantiles (TMP,) \n", + "1 2023-08-16 12:45:29.486 savgol_filter (EDA,) \n", + "2 2023-08-16 12:45:29.505 savgol_filter (ACC_x,), (ACC_y,), (ACC_z,) \n", + "3 2023-08-16 
12:45:29.510 smv (ACC_x, ACC_y, ACC_z) \n", + "4 2023-08-16 12:45:29.514 clip_quantiles (ACC_SMV,) \n", "\n", " output_names duration duration % \n", - "0 TMP 0 days 00:00:00.002578633 4.73 \n", - "1 EDA 0 days 00:00:00.000956786 1.76 \n", - "2 ACC_x, ACC_y, ACC_z 0 days 00:00:00.030402122 55.82 \n", - "3 ACC_SMV 0 days 00:00:00.006881422 12.63 \n", - "4 ACC_SMV 0 days 00:00:00.013646172 25.05 " + "0 TMP 0 days 00:00:00.002462031 7.67 \n", + "1 EDA 0 days 00:00:00.001066185 3.32 \n", + "2 ACC_x, ACC_y, ACC_z 0 days 00:00:00.019186704 59.79 \n", + "3 ACC_SMV 0 days 00:00:00.004629862 14.43 \n", + "4 ACC_SMV 0 days 00:00:00.004746801 14.79 " ] }, "execution_count": 8, @@ -361,7 +362,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -369,7 +369,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -382,16 +381,7 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jeroen/.cache/pypoetry/virtualenvs/tsflex-5Y4iXlk8-py3.10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "try: \n", " from tsflex.features import FeatureCollection, FuncWrapper\n", @@ -406,7 +396,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -416,7 +405,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -515,7 +503,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -525,7 +512,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -551,7 +537,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -561,7 +546,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -574,7 +558,6 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false, "pycharm": { "name": "#%%\n" } @@ -610,22 +593,22 @@ " \n", " \n", " \n", - " 2017-06-13 15:41:21+02:00\n", - " 34.369999\n", + " 2017-06-13 14:28:02.500000+02:00\n", + " 30.35\n", " \n", " \n", - " 2017-06-13 15:00:18.750000+02:00\n", - " 35.500000\n", + " 2017-06-13 15:49:32.500000+02:00\n", + " 33.68\n", " \n", " \n", "\n", "" ], "text/plain": [ - " TMP\n", - "timestamp \n", - "2017-06-13 15:41:21+02:00 34.369999\n", - "2017-06-13 15:00:18.750000+02:00 35.500000" + " TMP\n", + "timestamp \n", + "2017-06-13 14:28:02.500000+02:00 30.35\n", + "2017-06-13 15:49:32.500000+02:00 33.68" ] }, "execution_count": 11, @@ -638,7 +621,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -753,36 +735,36 @@ " \n", " \n", " \n", - " 2017-06-13 15:19:13+02:00\n", - " 35.029999\n", - " 34.930000\n", - " 8394.200195\n", - " -0.810092\n", - " 34.975834\n", - " 34.950001\n", - " 34.990002\n", - " 34.990002\n", - " 34.975845\n", - " -0.058798\n", - " -0.000345\n", - " 0.028884\n", - " 0.000834\n", - " \n", - " \n", - " 
2017-06-13 16:17:43+02:00\n", - " 30.250000\n", - " 29.809999\n", - " 7206.559570\n", - " -1.437857\n", - " 30.027332\n", - " 29.889999\n", - " 30.030001\n", - " 30.170000\n", - " 30.027679\n", - " 0.071025\n", - " 0.001219\n", - " 0.144382\n", - " 0.020846\n", + " 2017-06-13 14:38:43+02:00\n", + " 32.790001\n", + " 32.290001\n", + " 7803.440430\n", + " -1.432049\n", + " 32.514336\n", + " 32.330002\n", + " 32.510000\n", + " 32.660000\n", + " 32.514774\n", + " 0.209048\n", + " 0.002374\n", + " 0.16899\n", + " 0.028558\n", + " \n", + " \n", + " 2017-06-13 15:17:13+02:00\n", + " 34.610001\n", + " 34.430000\n", + " 8278.279297\n", + " -0.480542\n", + " 34.492832\n", + " 34.450001\n", + " 34.470001\n", + " 34.529999\n", + " 34.492874\n", + " 0.847968\n", + " 0.000680\n", + " 0.05345\n", + " 0.002857\n", " \n", " \n", "\n", @@ -791,33 +773,33 @@ "text/plain": [ " TMP__amax__w=1m TMP__amin__w=1m TMP__area__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 35.029999 34.930000 8394.200195 \n", - "2017-06-13 16:17:43+02:00 30.250000 29.809999 7206.559570 \n", + "2017-06-13 14:38:43+02:00 32.790001 32.290001 7803.440430 \n", + "2017-06-13 15:17:13+02:00 34.610001 34.430000 8278.279297 \n", "\n", " TMP__kurtosis__w=1m TMP__mean__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 -0.810092 34.975834 \n", - "2017-06-13 16:17:43+02:00 -1.437857 30.027332 \n", + "2017-06-13 14:38:43+02:00 -1.432049 32.514336 \n", + "2017-06-13 15:17:13+02:00 -0.480542 34.492832 \n", "\n", " TMP__quantile_0.25__w=1m TMP__quantile_0.5__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 34.950001 34.990002 \n", - "2017-06-13 16:17:43+02:00 29.889999 30.030001 \n", + "2017-06-13 14:38:43+02:00 32.330002 32.510000 \n", + "2017-06-13 15:17:13+02:00 34.450001 34.470001 \n", "\n", " TMP__quantile_0.75__w=1m TMP__rms__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 34.990002 34.975845 \n", - "2017-06-13 16:17:43+02:00 30.170000 30.027679 \n", + "2017-06-13 14:38:43+02:00 
32.660000 32.514774 \n", + "2017-06-13 15:17:13+02:00 34.529999 34.492874 \n", "\n", " TMP__skew__w=1m TMP__slope__w=1m TMP__std__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 -0.058798 -0.000345 0.028884 \n", - "2017-06-13 16:17:43+02:00 0.071025 0.001219 0.144382 \n", + "2017-06-13 14:38:43+02:00 0.209048 0.002374 0.16899 \n", + "2017-06-13 15:17:13+02:00 0.847968 0.000680 0.05345 \n", "\n", " TMP__var__w=1m \n", "timestamp \n", - "2017-06-13 15:19:13+02:00 0.000834 \n", - "2017-06-13 16:17:43+02:00 0.020846 " + "2017-06-13 14:38:43+02:00 0.028558 \n", + "2017-06-13 15:17:13+02:00 0.002857 " ] }, "execution_count": 13, @@ -838,7 +820,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -846,7 +827,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -854,7 +834,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -873,22 +852,22 @@ "EDA: (\n", "\twin: 1h : [\n", "\t\tFeatureDescriptor - func: FuncWrapper(mean, ['mean'], {}) stride: ['30s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(skew, ['skew'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {}) stride: ['30s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(sum, ['area'], {}) stride: ['30s'],\n", - "\t]\n", - "\twin: 30s : [\n", - "\t\tFeatureDescriptor - func: FuncWrapper(std, ['std'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(var, ['var'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}) stride: ['30s'],\n", "\t]\n", "\twin: 1m30s : [\n", + "\t\tFeatureDescriptor - func: FuncWrapper(std, ['std'], {}) stride: ['30s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(var, ['var'], {}) stride: ['30s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {}) stride: ['15s'],\n", + 
"\t\tFeatureDescriptor - func: FuncWrapper(slope, ['slope'], {}) stride: ['15s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(sum, ['area'], {}) stride: ['15s'],\n", + "\t]\n", + "\twin: 2m : [\n", "\t\tFeatureDescriptor - func: FuncWrapper(amax, ['amax'], {}) stride: ['15s'],\n", "\t\tFeatureDescriptor - func: FuncWrapper(amin, ['amin'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(, ['rms'], {}) stride: ['30s'],\n", "\t]\n", - "\twin: 2m : [\n", - "\t\tFeatureDescriptor - func: FuncWrapper(slope, ['slope'], {}) stride: ['15s'],\n", + "\twin: 30s : [\n", + "\t\tFeatureDescriptor - func: FuncWrapper(skew, ['skew'], {}) stride: ['15s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}) stride: ['30s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(, ['rms'], {}) stride: ['15s'],\n", "\t]\n", ")\n", "\n", @@ -916,19 +895,19 @@ " \n", " \n", " \n", - " EDA__amax__w=1m30s\n", - " EDA__amin__w=1m30s\n", - " EDA__area__w=1h\n", - " EDA__kurtosis__w=1h\n", + " EDA__amax__w=2m\n", + " EDA__amin__w=2m\n", + " EDA__area__w=1m30s\n", + " EDA__kurtosis__w=1m30s\n", " EDA__mean__w=1h\n", " EDA__quantile_0.25__w=30s\n", " EDA__quantile_0.5__w=30s\n", " EDA__quantile_0.75__w=30s\n", - " EDA__rms__w=1m30s\n", - " EDA__skew__w=1h\n", - " EDA__slope__w=2m\n", - " EDA__std__w=30s\n", - " EDA__var__w=30s\n", + " EDA__rms__w=30s\n", + " EDA__skew__w=30s\n", + " EDA__slope__w=1m30s\n", + " EDA__std__w=1m30s\n", + " EDA__var__w=1m30s\n", " \n", " \n", " timestamp\n", @@ -949,76 +928,76 @@ " \n", " \n", " \n", - " 2017-06-13 16:09:43+02:00\n", - " 1.207757\n", - " 1.139973\n", - " 30162.832031\n", - " -0.272705\n", - " 2.094641\n", - " 1.157878\n", - " 1.160436\n", - " 1.166831\n", - " 1.168666\n", - " 0.94309\n", - " -0.000009\n", - " 0.005837\n", - " 0.000034\n", - " \n", - " \n", - " 2017-06-13 15:02:58+02:00\n", - " 0.780491\n", - " 0.703754\n", - " NaN\n", - " 
NaN\n", + " 2017-06-13 16:12:13+02:00\n", + " 1.220547\n", + " 1.012079\n", + " 398.828674\n", + " 7.343093\n", + " 2.056634\n", + " 1.082421\n", + " 1.095849\n", + " 1.106721\n", + " 1.095335\n", + " -0.420529\n", + " -0.000089\n", + " 0.016636\n", + " 0.000277\n", + " \n", + " \n", + " 2017-06-13 14:52:58+02:00\n", + " 0.950591\n", + " 0.777933\n", + " 306.619629\n", + " -1.330574\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " 0.814525\n", + " 0.054707\n", + " -0.000287\n", " NaN\n", " NaN\n", - " -0.000113\n", - " 0.005096\n", - " 0.000026\n", " \n", " \n", "\n", "" ], "text/plain": [ - " EDA__amax__w=1m30s EDA__amin__w=1m30s \\\n", - "timestamp \n", - "2017-06-13 16:09:43+02:00 1.207757 1.139973 \n", - "2017-06-13 15:02:58+02:00 0.780491 0.703754 \n", + " EDA__amax__w=2m EDA__amin__w=2m \\\n", + "timestamp \n", + "2017-06-13 16:12:13+02:00 1.220547 1.012079 \n", + "2017-06-13 14:52:58+02:00 0.950591 0.777933 \n", "\n", - " EDA__area__w=1h EDA__kurtosis__w=1h \\\n", - "timestamp \n", - "2017-06-13 16:09:43+02:00 30162.832031 -0.272705 \n", - "2017-06-13 15:02:58+02:00 NaN NaN \n", + " EDA__area__w=1m30s EDA__kurtosis__w=1m30s \\\n", + "timestamp \n", + "2017-06-13 16:12:13+02:00 398.828674 7.343093 \n", + "2017-06-13 14:52:58+02:00 306.619629 -1.330574 \n", "\n", " EDA__mean__w=1h EDA__quantile_0.25__w=30s \\\n", "timestamp \n", - "2017-06-13 16:09:43+02:00 2.094641 1.157878 \n", - "2017-06-13 15:02:58+02:00 NaN NaN \n", + "2017-06-13 16:12:13+02:00 2.056634 1.082421 \n", + "2017-06-13 14:52:58+02:00 NaN NaN \n", "\n", " EDA__quantile_0.5__w=30s \\\n", "timestamp \n", - "2017-06-13 16:09:43+02:00 1.160436 \n", - "2017-06-13 15:02:58+02:00 NaN \n", + "2017-06-13 16:12:13+02:00 1.095849 \n", + "2017-06-13 14:52:58+02:00 NaN \n", "\n", - " EDA__quantile_0.75__w=30s EDA__rms__w=1m30s \\\n", - "timestamp \n", - "2017-06-13 16:09:43+02:00 1.166831 1.168666 \n", - "2017-06-13 15:02:58+02:00 NaN NaN \n", + " EDA__quantile_0.75__w=30s EDA__rms__w=30s \\\n", + "timestamp 
\n", + "2017-06-13 16:12:13+02:00 1.106721 1.095335 \n", + "2017-06-13 14:52:58+02:00 NaN 0.814525 \n", "\n", - " EDA__skew__w=1h EDA__slope__w=2m EDA__std__w=30s \\\n", - "timestamp \n", - "2017-06-13 16:09:43+02:00 0.94309 -0.000009 0.005837 \n", - "2017-06-13 15:02:58+02:00 NaN -0.000113 0.005096 \n", + " EDA__skew__w=30s EDA__slope__w=1m30s \\\n", + "timestamp \n", + "2017-06-13 16:12:13+02:00 -0.420529 -0.000089 \n", + "2017-06-13 14:52:58+02:00 0.054707 -0.000287 \n", "\n", - " EDA__var__w=30s \n", - "timestamp \n", - "2017-06-13 16:09:43+02:00 0.000034 \n", - "2017-06-13 15:02:58+02:00 0.000026 " + " EDA__std__w=1m30s EDA__var__w=1m30s \n", + "timestamp \n", + "2017-06-13 16:12:13+02:00 0.016636 0.000277 \n", + "2017-06-13 14:52:58+02:00 NaN NaN " ] }, "execution_count": 14, @@ -1052,7 +1031,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1094,7 +1072,7 @@ " \n", " \n", " \n", - " EDA__area__w=1h\n", + " EDA__amax__w=2m\n", " \n", " \n", " timestamp\n", @@ -1103,27 +1081,27 @@ " \n", " \n", " \n", - " 2017-06-13 15:26:43+02:00\n", - " 15623.870117\n", + " 2017-06-13 15:55:28+02:00\n", + " 4.517331\n", " \n", " \n", - " 2017-06-13 16:18:13+02:00\n", - " 28911.509766\n", + " 2017-06-13 16:05:58+02:00\n", + " 1.471220\n", " \n", " \n", - " 2017-06-13 16:19:43+02:00\n", - " 28819.720703\n", + " 2017-06-13 15:51:43+02:00\n", + " 4.738588\n", " \n", " \n", "\n", "" ], "text/plain": [ - " EDA__area__w=1h\n", + " EDA__amax__w=2m\n", "timestamp \n", - "2017-06-13 15:26:43+02:00 15623.870117\n", - "2017-06-13 16:18:13+02:00 28911.509766\n", - "2017-06-13 16:19:43+02:00 28819.720703" + "2017-06-13 15:55:28+02:00 4.517331\n", + "2017-06-13 16:05:58+02:00 1.471220\n", + "2017-06-13 15:51:43+02:00 4.738588" ] }, "execution_count": 15, @@ -1138,7 +1116,633 @@ ] }, { - "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rounded start index feature extraction" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "When working with data that contain 'unclean' timestamps (i.e., timestamps that do not fall on round time values), it is possible to prettify the index of the results by using the `exact_time` parameter.\n", + "\n", + "Here you can see some data with a very fine (sub-second) time resolution." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACC_xACC_yACC_z
timestamp
2017-06-13 14:22:13.937500+02:001663
2017-06-13 14:22:13.968750+02:001663
2017-06-13 14:22:14+02:00-1463
2017-06-13 14:22:14.031250+02:00-2463
2017-06-13 14:22:14.062500+02:000563
\n", + "
" + ], + "text/plain": [ + " ACC_x ACC_y ACC_z\n", + "timestamp \n", + "2017-06-13 14:22:13.937500+02:00 1 6 63\n", + "2017-06-13 14:22:13.968750+02:00 1 6 63\n", + "2017-06-13 14:22:14+02:00 -1 4 63\n", + "2017-06-13 14:22:14.031250+02:00 -2 4 63\n", + "2017-06-13 14:22:14.062500+02:00 0 5 63" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_acc.iloc[30:].head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ACC_x: (\n", + "\twin: 30s : [\n", + "\t\tFeatureDescriptor - func: FuncWrapper(std, ['std'], {}) stride: ['10s'],\n", + "\t]\n", + ")" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fc = FeatureCollection(\n", + " [\n", + "FeatureDescriptor(\n", + " series_name=\"ACC_x\",\n", + " window='30s',\n", + " stride='10s',\n", + " function=np.std,\n", + " )\n", + " ]\n", + ")\n", + "\n", + "fc" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACC_x__std__w=30s
timestamp
2017-06-13 14:22:43.937500+02:000.771801
2017-06-13 14:22:53.937500+02:000.799006
2017-06-13 14:23:03.937500+02:000.659779
2017-06-13 14:23:13.937500+02:000.624957
2017-06-13 14:23:23.937500+02:000.700799
......
2017-06-13 16:27:13.937500+02:0018.077957
2017-06-13 16:27:23.937500+02:008.099124
2017-06-13 16:27:33.937500+02:0014.798578
2017-06-13 16:27:43.937500+02:0014.624782
2017-06-13 16:27:53.937500+02:0013.951030
\n", + "

752 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " ACC_x__std__w=30s\n", + "timestamp \n", + "2017-06-13 14:22:43.937500+02:00 0.771801\n", + "2017-06-13 14:22:53.937500+02:00 0.799006\n", + "2017-06-13 14:23:03.937500+02:00 0.659779\n", + "2017-06-13 14:23:13.937500+02:00 0.624957\n", + "2017-06-13 14:23:23.937500+02:00 0.700799\n", + "... ...\n", + "2017-06-13 16:27:13.937500+02:00 18.077957\n", + "2017-06-13 16:27:23.937500+02:00 8.099124\n", + "2017-06-13 16:27:33.937500+02:00 14.798578\n", + "2017-06-13 16:27:43.937500+02:00 14.624782\n", + "2017-06-13 16:27:53.937500+02:00 13.951030\n", + "\n", + "[752 rows x 1 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fc.calculate(df_acc.iloc[30:])[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can, for example, round to a resolution of 1 second, as seen below.\n", + "\n", + "**Note:**\n", + "This can give (slightly) different results than when the timestamps are not rounded. How much they differ also depends on the resolution used." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACC_x__std__w=30s
timestamp
2017-06-13 14:22:44+02:000.770055
2017-06-13 14:22:54+02:000.798542
2017-06-13 14:23:04+02:000.660551
2017-06-13 14:23:14+02:000.624135
2017-06-13 14:23:24+02:000.700799
......
2017-06-13 16:27:14+02:0018.065915
2017-06-13 16:27:24+02:008.160273
2017-06-13 16:27:34+02:0014.800646
2017-06-13 16:27:44+02:0014.624782
2017-06-13 16:27:54+02:0013.952225
\n", + "

752 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " ACC_x__std__w=30s\n", + "timestamp \n", + "2017-06-13 14:22:44+02:00 0.770055\n", + "2017-06-13 14:22:54+02:00 0.798542\n", + "2017-06-13 14:23:04+02:00 0.660551\n", + "2017-06-13 14:23:14+02:00 0.624135\n", + "2017-06-13 14:23:24+02:00 0.700799\n", + "... ...\n", + "2017-06-13 16:27:14+02:00 18.065915\n", + "2017-06-13 16:27:24+02:00 8.160273\n", + "2017-06-13 16:27:34+02:00 14.800646\n", + "2017-06-13 16:27:44+02:00 14.624782\n", + "2017-06-13 16:27:54+02:00 13.952225\n", + "\n", + "[752 rows x 1 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fc.calculate(df_acc.iloc[30:], exact_time='1s')[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For time indices, you can also use a timedelta object." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACC_x__std__w=30s
timestamp
2017-06-13 14:22:50+02:000.783929
2017-06-13 14:23:00+02:000.798185
2017-06-13 14:23:10+02:000.665362
2017-06-13 14:23:20+02:000.631974
2017-06-13 14:23:30+02:000.683508
......
2017-06-13 16:27:20+02:0011.929327
2017-06-13 16:27:30+02:0014.153599
2017-06-13 16:27:40+02:0014.686610
2017-06-13 16:27:50+02:0014.468111
2017-06-13 16:28:00+02:0010.192261
\n", + "

752 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " ACC_x__std__w=30s\n", + "timestamp \n", + "2017-06-13 14:22:50+02:00 0.783929\n", + "2017-06-13 14:23:00+02:00 0.798185\n", + "2017-06-13 14:23:10+02:00 0.665362\n", + "2017-06-13 14:23:20+02:00 0.631974\n", + "2017-06-13 14:23:30+02:00 0.683508\n", + "... ...\n", + "2017-06-13 16:27:20+02:00 11.929327\n", + "2017-06-13 16:27:30+02:00 14.153599\n", + "2017-06-13 16:27:40+02:00 14.686610\n", + "2017-06-13 16:27:50+02:00 14.468111\n", + "2017-06-13 16:28:00+02:00 10.192261\n", + "\n", + "[752 rows x 1 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fc.calculate(df_acc.iloc[30:], exact_time=pd.Timedelta(10, 'seconds'))[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, when `exact_time` is set to `False`, the start index is rounded to a multiple of the least common multiple (LCM) of the strides and window. (In this case, the first index will be rounded to a multiple of 30 seconds.)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACC_x__std__w=30s
timestamp
2017-06-13 14:23:00+02:000.798185
2017-06-13 14:23:10+02:000.665362
2017-06-13 14:23:20+02:000.631974
2017-06-13 14:23:30+02:000.683508
2017-06-13 14:23:40+02:000.629029
......
2017-06-13 16:27:20+02:0011.929327
2017-06-13 16:27:30+02:0014.153599
2017-06-13 16:27:40+02:0014.686610
2017-06-13 16:27:50+02:0014.468111
2017-06-13 16:28:00+02:0010.192261
\n", + "

751 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " ACC_x__std__w=30s\n", + "timestamp \n", + "2017-06-13 14:23:00+02:00 0.798185\n", + "2017-06-13 14:23:10+02:00 0.665362\n", + "2017-06-13 14:23:20+02:00 0.631974\n", + "2017-06-13 14:23:30+02:00 0.683508\n", + "2017-06-13 14:23:40+02:00 0.629029\n", + "... ...\n", + "2017-06-13 16:27:20+02:00 11.929327\n", + "2017-06-13 16:27:30+02:00 14.153599\n", + "2017-06-13 16:27:40+02:00 14.686610\n", + "2017-06-13 16:27:50+02:00 14.468111\n", + "2017-06-13 16:28:00+02:00 10.192261\n", + "\n", + "[751 rows x 1 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fc.calculate(df_acc.iloc[30:], exact_time=False)[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This functionality can also be used with sequence data, where the indices are numeric. In that case, the values of `window` and `stride` should be numeric, and the value of `exact_time` can be a `bool`, `int` or `float`.\n", + "\n", + "The full description of this parameter can be found in this table:\n", + "\n", + "| index datatype | rounding datatype | return datatype | extra info |\n", + "| :------------- | :---------------- | --------------: | ----------------------------------------------------------------: |\n", + "| int | int | int | round `index` to nearest multiple of `rounding` |\n", + "| | float | float | round `index` to nearest multiple of `rounding` |\n", + "| | bool | float | round `index` to lcm of `window` and/or `stride` |\n", + "| float | int | float | round `index` to nearest multiple of `rounding` |\n", + "| | float | float | round `index` to nearest multiple of `rounding` |\n", + "| | bool | float | round `index` to lcm of `window` and/or `stride` |\n", + "| pd.Timestamp | str | pd.Timestamp | round `index` to resolution of `rounding` (e.g.
'10s', '2m', 'h') |\n", + "| | bool | pd.Timestamp | round `index` to lcm of `window` and/or `stride` |\n", + "| | pd.Timedelta | pd.Timestamp | round `index` to nearest multiple of `rounding`. |\n" + ] + }, + { "cell_type": "markdown", "metadata": { "tags": [] @@ -1161,22 +1765,22 @@ "EDA: (\n", "\twin: 1h : [\n", "\t\tFeatureDescriptor - func: FuncWrapper(mean, ['mean'], {}) stride: ['30s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(skew, ['skew'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {}) stride: ['30s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(sum, ['area'], {}) stride: ['30s'],\n", - "\t]\n", - "\twin: 30s : [\n", - "\t\tFeatureDescriptor - func: FuncWrapper(std, ['std'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(var, ['var'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}) stride: ['30s'],\n", "\t]\n", "\twin: 1m30s : [\n", + "\t\tFeatureDescriptor - func: FuncWrapper(std, ['std'], {}) stride: ['30s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(var, ['var'], {}) stride: ['30s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(kurtosis, ['kurtosis'], {}) stride: ['15s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(slope, ['slope'], {}) stride: ['15s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(sum, ['area'], {}) stride: ['15s'],\n", + "\t]\n", + "\twin: 2m : [\n", "\t\tFeatureDescriptor - func: FuncWrapper(amax, ['amax'], {}) stride: ['15s'],\n", "\t\tFeatureDescriptor - func: FuncWrapper(amin, ['amin'], {}) stride: ['15s'],\n", - "\t\tFeatureDescriptor - func: FuncWrapper(, ['rms'], {}) stride: ['30s'],\n", "\t]\n", - "\twin: 2m : [\n", - "\t\tFeatureDescriptor - func: FuncWrapper(slope, ['slope'], {}) stride: ['15s'],\n", + "\twin: 30s : [\n", + "\t\tFeatureDescriptor - func: FuncWrapper(skew, ['skew'], {}) stride: ['15s'],\n", +
"\t\tFeatureDescriptor - func: FuncWrapper(quantile, ['quantile_0.25', 'quantile_0.5', 'quantile_0.75'], {'q': [0.25, 0.5, 0.75]}) stride: ['30s'],\n", + "\t\tFeatureDescriptor - func: FuncWrapper(, ['rms'], {}) stride: ['15s'],\n", "\t]\n", ")\n", "TMP: (\n", @@ -1219,16 +1823,16 @@ " \n", " \n", " \n", - " EDA__amax__w=1m30s\n", - " EDA__amin__w=1m30s\n", - " EDA__area__w=1h\n", - " EDA__kurtosis__w=1h\n", + " EDA__amax__w=2m\n", + " EDA__amin__w=2m\n", + " EDA__area__w=1m30s\n", + " EDA__kurtosis__w=1m30s\n", " EDA__mean__w=1h\n", " EDA__quantile_0.25__w=30s\n", " EDA__quantile_0.5__w=30s\n", " EDA__quantile_0.75__w=30s\n", - " EDA__rms__w=1m30s\n", - " EDA__skew__w=1h\n", + " EDA__rms__w=30s\n", + " EDA__skew__w=30s\n", " ...\n", " TMP__kurtosis__w=1m\n", " TMP__mean__w=1m\n", @@ -1268,52 +1872,52 @@ " \n", " \n", " \n", - " 2017-06-13 15:19:13+02:00\n", - " 1.272983\n", - " 1.141252\n", + " 2017-06-13 15:39:28+02:00\n", + " 3.183709\n", + " 1.596755\n", + " 921.28009\n", + " 0.894696\n", " NaN\n", " NaN\n", " NaN\n", - " 1.150205\n", - " 1.156600\n", - " 1.164273\n", - " 1.191974\n", " NaN\n", + " 2.297914\n", + " -2.049654\n", " ...\n", - " -0.810092\n", - " 34.975834\n", - " 34.950001\n", - " 34.990002\n", - " 34.990002\n", - " 34.975845\n", - " -0.058798\n", - " -0.000345\n", - " 0.028884\n", - " 0.000834\n", - " \n", - " \n", - " 2017-06-13 15:29:43+02:00\n", - " 1.610625\n", - " 1.311352\n", - " 16156.117188\n", - " -0.311375\n", - " 1.121953\n", - " 1.327978\n", - " 1.352278\n", - " 1.368264\n", - " 1.439992\n", - " 0.933501\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 2017-06-13 15:40:13+02:00\n", + " 2.855021\n", + " 1.596755\n", + " 827.42395\n", + " 1.085358\n", + " 1.263708\n", + " 2.137228\n", + " 2.169202\n", + " 2.221959\n", + " 2.179614\n", + " 0.264285\n", " ...\n", - " -0.514506\n", - " 33.743000\n", - " 33.730000\n", - " 33.740000\n", - " 
33.750000\n", - " 33.743008\n", - " 0.442661\n", - " 0.000050\n", - " 0.022753\n", - " 0.000518\n", + " -1.026206\n", + " 34.577831\n", + " 34.549999\n", + " 34.57\n", + " 34.610001\n", + " 34.577858\n", + " 0.256684\n", + " -0.000537\n", + " 0.040788\n", + " 0.001664\n", " \n", " \n", "\n", @@ -1321,55 +1925,55 @@ "" ], "text/plain": [ - " EDA__amax__w=1m30s EDA__amin__w=1m30s \\\n", - "timestamp \n", - "2017-06-13 15:19:13+02:00 1.272983 1.141252 \n", - "2017-06-13 15:29:43+02:00 1.610625 1.311352 \n", + " EDA__amax__w=2m EDA__amin__w=2m \\\n", + "timestamp \n", + "2017-06-13 15:39:28+02:00 3.183709 1.596755 \n", + "2017-06-13 15:40:13+02:00 2.855021 1.596755 \n", "\n", - " EDA__area__w=1h EDA__kurtosis__w=1h \\\n", - "timestamp \n", - "2017-06-13 15:19:13+02:00 NaN NaN \n", - "2017-06-13 15:29:43+02:00 16156.117188 -0.311375 \n", + " EDA__area__w=1m30s EDA__kurtosis__w=1m30s \\\n", + "timestamp \n", + "2017-06-13 15:39:28+02:00 921.28009 0.894696 \n", + "2017-06-13 15:40:13+02:00 827.42395 1.085358 \n", "\n", " EDA__mean__w=1h EDA__quantile_0.25__w=30s \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 NaN 1.150205 \n", - "2017-06-13 15:29:43+02:00 1.121953 1.327978 \n", + "2017-06-13 15:39:28+02:00 NaN NaN \n", + "2017-06-13 15:40:13+02:00 1.263708 2.137228 \n", "\n", " EDA__quantile_0.5__w=30s \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 1.156600 \n", - "2017-06-13 15:29:43+02:00 1.352278 \n", + "2017-06-13 15:39:28+02:00 NaN \n", + "2017-06-13 15:40:13+02:00 2.169202 \n", "\n", - " EDA__quantile_0.75__w=30s EDA__rms__w=1m30s \\\n", - "timestamp \n", - "2017-06-13 15:19:13+02:00 1.164273 1.191974 \n", - "2017-06-13 15:29:43+02:00 1.368264 1.439992 \n", + " EDA__quantile_0.75__w=30s EDA__rms__w=30s \\\n", + "timestamp \n", + "2017-06-13 15:39:28+02:00 NaN 2.297914 \n", + "2017-06-13 15:40:13+02:00 2.221959 2.179614 \n", "\n", - " EDA__skew__w=1h ... TMP__kurtosis__w=1m \\\n", - "timestamp ... \n", - "2017-06-13 15:19:13+02:00 NaN ... 
-0.810092 \n", - "2017-06-13 15:29:43+02:00 0.933501 ... -0.514506 \n", + " EDA__skew__w=30s ... TMP__kurtosis__w=1m \\\n", + "timestamp ... \n", + "2017-06-13 15:39:28+02:00 -2.049654 ... NaN \n", + "2017-06-13 15:40:13+02:00 0.264285 ... -1.026206 \n", "\n", " TMP__mean__w=1m TMP__quantile_0.25__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 34.975834 34.950001 \n", - "2017-06-13 15:29:43+02:00 33.743000 33.730000 \n", + "2017-06-13 15:39:28+02:00 NaN NaN \n", + "2017-06-13 15:40:13+02:00 34.577831 34.549999 \n", "\n", " TMP__quantile_0.5__w=1m TMP__quantile_0.75__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 34.990002 34.990002 \n", - "2017-06-13 15:29:43+02:00 33.740000 33.750000 \n", + "2017-06-13 15:39:28+02:00 NaN NaN \n", + "2017-06-13 15:40:13+02:00 34.57 34.610001 \n", "\n", " TMP__rms__w=1m TMP__skew__w=1m TMP__slope__w=1m \\\n", "timestamp \n", - "2017-06-13 15:19:13+02:00 34.975845 -0.058798 -0.000345 \n", - "2017-06-13 15:29:43+02:00 33.743008 0.442661 0.000050 \n", + "2017-06-13 15:39:28+02:00 NaN NaN NaN \n", + "2017-06-13 15:40:13+02:00 34.577858 0.256684 -0.000537 \n", "\n", " TMP__std__w=1m TMP__var__w=1m \n", "timestamp \n", - "2017-06-13 15:19:13+02:00 0.028884 0.000834 \n", - "2017-06-13 15:29:43+02:00 0.022753 0.000518 \n", + "2017-06-13 15:39:28+02:00 NaN NaN \n", + "2017-06-13 15:40:13+02:00 0.040788 0.001664 \n", "\n", "[2 rows x 26 columns]" ] @@ -1396,7 +2000,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1404,7 +2007,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1789,7 +2391,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1797,7 +2398,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -1816,7 +2416,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1837,7 +2436,6 @@ ] }, { - "attachments": {}, "cell_type": 
"markdown", "metadata": { "tags": [] @@ -1847,7 +2445,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1865,7 +2462,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -1875,7 +2471,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1973,7 +2568,7 @@ "hash": "bb9f799bd6e477ee22986cb3017f2b22764a696a803a513374640b5de3a6fdad" }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1987,7 +2582,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.3" }, "metadata": { "interpreter": { diff --git a/tests/test_features_feature_collection.py b/tests/test_features_feature_collection.py index ea931dc8..5ec526ff 100644 --- a/tests/test_features_feature_collection.py +++ b/tests/test_features_feature_collection.py @@ -2326,3 +2326,229 @@ def test_feature_collection_various_timezones_segment_start_idxs(): s_usa, segment_start_idxs=s_none.index[:3].values, n_jobs=0, return_df=True ) assert np.all(res.values == []) + + +@pytest.mark.parametrize( + "test_setup", + [ + # (start_idx, exact_time, stride, window, expected_result) + (12, 5, None, None, 15), + (10, 5, None, None, 10), + (10, 2.5, None, None, 10), + (3, 2.5, None, None, 5.0), + (7, True, None, None, 7), + (10, False, None, 2, 10), + (10, False, None, 3, 12), + (100, False, 7, 3, 105), + (100, False, [7, 9, 11], 3, 693), + (100, False, [3.5, 7.0, 9.4], 3, 987), + ], +) +def test_process_non_exact_start_idx_int(test_setup): + start_idx, exact_time, stride, window, expected_result = test_setup + result = FeatureCollection._process_non_exact_start_idx( + start_idx, exact_time, stride, window + ) + + assert result == expected_result + + assert isinstance(result, (int, float, np.int32, np.int64)) + + +@pytest.mark.parametrize( + "test_setup", + [ + # (start_idx, 
exact_time, stride, window, expected_result) + (32.5, 12.4, None, None, 37.2), + (7.5, 2.5, None, None, 7.5), + (7.5, 5, None, None, 10.0), + (31.0, 21.29485, None, None, 42.5897), + (21.395, True, None, None, 21.395), + (21.395, False, 5, 7, 35.0), + (21.395, False, None, 20, 40.0), + (21.395, False, None, 17.475, 34.95), + (21.395, False, [2.5, 3.4, 5.0], 17, 85.0), + ], +) +def test_process_non_exact_start_idx_float(test_setup): + start_idx, exact_time, stride, window, expected_result = test_setup + result = FeatureCollection._process_non_exact_start_idx( + start_idx, exact_time, stride, window + ) + + assert result == expected_result + + assert isinstance(result, float) + + +@pytest.mark.parametrize( + "test_setup", + [ + # (start_idx, exact_time, stride, window, expected_result) + ( + pd.Timestamp("2019-01-01T09:00:00+0100"), + "1D", + None, + None, + pd.Timestamp("2019-01-02T00:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T09:00:00+0100"), + "1h", + None, + None, + pd.Timestamp("2019-01-01T09:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + "2h", + None, + None, + pd.Timestamp("2019-01-01T12:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T09:08:16+0100"), + "15min", + None, + None, + pd.Timestamp("2019-01-01T09:15:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T09:00:00+0100"), + pd.Timedelta(1, "day"), + None, + None, + pd.Timestamp("2019-01-02T00:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T09:00:00+0100"), + pd.Timedelta(1, "hour"), + None, + None, + pd.Timestamp("2019-01-01T09:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + pd.Timedelta(2, "hours"), + None, + None, + pd.Timestamp("2019-01-01T12:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T09:08:16+0100"), + pd.Timedelta(15, "minutes"), + None, + None, + pd.Timestamp("2019-01-01T09:15:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + True, + None, + None, + pd.Timestamp("2019-01-01T10:08:55+0100"), + ), + ( + 
pd.Timestamp("2019-01-01T09:00:00+0100"), + False, + None, + pd.Timedelta(1, "hour"), + pd.Timestamp("2019-01-01T09:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T09:00:00+0100"), + False, + pd.Timedelta(2, "hours"), + pd.Timedelta(45, "minutes"), + pd.Timestamp("2019-01-01T12:00:00+0100"), + ), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + False, + [ + pd.Timedelta(20, "minutes"), + pd.Timedelta(30, "minutes"), + pd.Timedelta(60, "minutes"), + ], + pd.Timedelta(10, "minutes"), + pd.Timestamp("2019-01-01T11:00:00+0100"), + ), + ], +) +def test_process_non_exact_start_idx_timestamp(test_setup): + start_idx, exact_time, stride, window, expected_result = test_setup + result = FeatureCollection._process_non_exact_start_idx( + start_idx, exact_time, stride, window + ) + + assert result == expected_result + + assert isinstance(result, pd.Timestamp) + + +@pytest.mark.parametrize( + "test_setup", + [ + # (start_idx, stride) + (100, None), + (100, [7, 9, 11]), + (21.395, None), + (21.395, [2.5, 5.0]), + (pd.Timestamp("2019-01-01T10:08:55+0100"), None), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + [ + pd.Timedelta(20, "minutes"), + pd.Timedelta(30, "minutes"), + pd.Timedelta(60, "minutes"), + ], + ), + ], +) +def test_process_non_exact_start_idx_window_none(test_setup): + start_idx, stride = test_setup + with pytest.raises(AssertionError, match=r".*window argument is required.*"): + FeatureCollection._process_non_exact_start_idx(start_idx, False, stride, None) + + +@pytest.mark.parametrize( + "test_setup", + [ + # (start_idx, exact_time, stride, window) + (5, False, None, pd.Timedelta(10, "minutes")), + (5, False, 7, pd.Timedelta(10, "minutes")), + (5, False, 7.95, pd.Timedelta(10, "minutes")), + (5, False, pd.Timedelta(20, "minutes"), pd.Timedelta(10, "minutes")), + (5.25, False, None, pd.Timedelta(10, "minutes")), + (5.25, False, 7, pd.Timedelta(10, "minutes")), + (5.25, False, 7.95, pd.Timedelta(10, "minutes")), + (5.25, False, pd.Timedelta(20, "minutes"), 
pd.Timedelta(10, "minutes")), + (pd.Timestamp("2019-01-01T10:08:55+0100"), False, None, 5), + (pd.Timestamp("2019-01-01T10:08:55+0100"), False, None, 5.75), + (pd.Timestamp("2019-01-01T10:08:55+0100"), False, 7, 5), + (pd.Timestamp("2019-01-01T10:08:55+0100"), False, 7.95, 5.75), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + False, + pd.Timedelta(10, "minutes"), + 5, + ), + ( + pd.Timestamp("2019-01-01T10:08:55+0100"), + False, + 7.95, + pd.Timedelta(10, "minutes"), + ), + ], +) +def test_process_non_exact_start_idx_incorrect_types(test_setup): + start_idx, exact_time, stride, window = test_setup + with pytest.raises((AssertionError, ValueError)): + FeatureCollection._process_non_exact_start_idx( + start_idx, exact_time, stride, window + ) + + +def test_process_non_exact_start_idx_unsupported_type(): + with pytest.raises(TypeError, match=r".*not supported as `exact_time` argument.*"): + FeatureCollection._process_non_exact_start_idx(2, dict(), None, None) diff --git a/tsflex/features/feature_collection.py b/tsflex/features/feature_collection.py index b4ed870a..b31f31be 100644 --- a/tsflex/features/feature_collection.py +++ b/tsflex/features/feature_collection.py @@ -12,10 +12,12 @@ __author__ = "Jonas Van Der Donckt, Emiel Deprost, Jeroen Van Der Donckt" +import math import os import traceback import uuid from copy import deepcopy +from datetime import datetime from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union @@ -26,7 +28,7 @@ from tqdm.auto import tqdm from ..features.function_wrapper import FuncWrapper -from ..utils.attribute_parsing import AttributeParser +from ..utils.attribute_parsing import AttributeParser, DataType from ..utils.data import flatten, to_list, to_series_list from ..utils.logging import add_logging_handler, delete_logging_handlers from ..utils.time import parse_time_arg, timedelta_to_str @@ -275,6 +277,7 @@ def _stroll_feat_generator( segment_start_idxs: Union[np.ndarray, None], 
segment_end_idxs: Union[np.ndarray, None], start_idx: Any, + exact_time, end_idx: Any, window_idx: str, include_final_window: bool, @@ -297,6 +300,11 @@ def get_stroll_function(idx) -> Tuple[StridedRolling, FuncWrapper]: ] stride = feature.stride if calc_stride is None else calc_stride function: FuncWrapper = feature.function + + cleaned_start_idx = self._process_non_exact_start_idx( + start_idx, exact_time, calc_stride, win + ) + # The factory method will instantiate the right StridedRolling object stroll_arg_dict = dict( data=[series_dict[k] for k in key], @@ -304,7 +312,7 @@ def get_stroll_function(idx) -> Tuple[StridedRolling, FuncWrapper]: strides=stride, segment_start_idxs=segment_start_idxs, segment_end_idxs=segment_end_idxs, - start_idx=start_idx, + start_idx=cleaned_start_idx, end_idx=end_idx, window_idx=window_idx, include_final_window=include_final_window, @@ -341,6 +349,164 @@ def _process_segment_idxs( segment_idxs = segment_idxs.squeeze() # remove singleton dimensions return segment_idxs + @staticmethod + def _process_non_exact_start_idx( + start_idx: Union[pd.Timestamp, float, int], + exact_time: Union[bool, str, pd.Timedelta, int, float], + stride: Optional[Union[float, str, pd.Timedelta, List, None]] = None, + window: Optional[Union[float, str, pd.Timedelta]] = None, + ) -> Union[pd.Timestamp, float, int]: + """Round start idx according to value of `exact_time` parameter. + + Parameters + ---------- + start_idx : pd.Timestamp + exact_time : Union[bool, str, pd.Timedelta, int, float] + How to perform the start index rounding. This argument supports multiple types:\n + * If the type is a `bool`, rounding resolution will be calculated using + least common multiple of stride and window. + * If the type is a `str`, the string must represent a frequency string indicating + the rounding resolution. Hence, the **passed data must have a time-index**. 
+ * If the type is an `float` or an `int`, its value represents the series:\n + - its stride **range** when a **non time-indexed** series is passed. + * If the exact_time's type is a `pd.Timedelta`, the exact_time size represents + the exact_time-time delta. The passed data **must have a time-index**. + start_idx is rounded to multiple of exact_time, using ceiling rounding. + stride: Union[float, str, pd.Timedelta, List[Union[float, str, pd.Timedelta], None], optional + The stride size. By default None. This argument supports multiple types: \n + * If None, the stride of the `FeatureDescriptor` objects will be used. + * If the type is an `float` or an `int`, its value represents the series:\n + - its stride **range** when a **non time-indexed** series is passed. + - the stride in **number of samples**, when a **time-indexed** series + is passed (must then be and `int`) + * If the stride's type is a `pd.Timedelta`, the stride size represents + the stride-time delta. The passed data **must have a time-index**. + * If a `str`, it must represent a stride-time-delta-string. Hence, the + **passed data must have a time-index**. \n + window : Union[float, str, pd.Timedelta], optional + + Returns + ------- + start index rounded to `exact_time`. + return value type depends on start_idx type + + .. 
note:: + ```md + | index datatype | rounding datatype | return datatype | extra info | + | :------------- | :---------------- | --------------: | :---------------------------------------------------------------- | + | int | int | int | round `index` to nearest multiple of `rounding` | + | | float | float | round `index` to nearest multiple of `rounding` | + | | bool | float | round `index` to LCM of `window` and/or `stride` | + | float | int | float | round `index` to nearest multiple of `rounding` | + | | float | float | round `index` to nearest multiple of `rounding` | + | | bool | float | round `index` to LCM of `window` and/or `stride` | + | pd.Timestamp | str | pd.Timestamp | round `index` to resolution of `rounding` (e.g. '10s', '2m', 'H') | + | | bool | pd.Timestamp | round `index` to LCM of `window` and/or `stride` | + | | pd.Timedelta | pd.Timestamp | round `index` to nearest multiple of `rounding`. | + ``` + """ + + def numeric_ceil(index, round_to): + int_div = index // round_to + ceil_offset = int((index % round_to) != 0) * round_to + return (int_div * round_to) + ceil_offset + + def dt_ceil(dt, delta): + start_time_py = start_idx.to_pydatetime() + delta_py_time = delta.to_pytimedelta() + dt_min = datetime.min + dt_min = dt_min.replace(tzinfo=dt.tzinfo) + dt_ceil = ( + dt_min + + math.ceil((start_time_py - dt_min) / delta_py_time) * delta_py_time + ) + return pd.Timestamp(dt_ceil) + + def is_float(x) -> bool: + xt = type(x) + return isinstance(x, float) or np.issubdtype(xt, np.floating) + + def is_int(x) -> bool: + xt = type(x) + return isinstance(x, int) or np.issubdtype(xt, np.integer) + + def is_numeric(x) -> bool: + return is_float(x) or is_int(x) + + exact_time_type = type(exact_time) + if isinstance(exact_time, bool): + # default case, exact_time is True by default + if exact_time: + return start_idx + # calculate LCM of window & stride + assert ( + window is not None + ), "if exact_time is `False`, then the window argument is required" + if 
stride is None: + strides = [] + elif isinstance(stride, list) or isinstance(stride, np.ndarray): + strides = [s for s in stride] + else: + strides = [stride] + + # as we need the LCM of the whole list, we can just append the window + lcm_values = strides + [window] + + arg_dtype = AttributeParser.determine_type(lcm_values) + + # should not happen, but extra check to cover the possibilities + assert arg_dtype != DataType.UNDEFINED + + if arg_dtype == DataType.SEQUENCE: + idx_dtype = AttributeParser.determine_type(start_idx) + assert idx_dtype == DataType.SEQUENCE + # numeric LCM + if is_int(lcm_values[0]): + lcm = np.lcm.reduce(lcm_values) + else: + # slightly ugly way to perform LCM on floats + PRECISION = 5 + for idx, val in enumerate(lcm_values): + rounded_val = np.round(val, decimals=PRECISION) + lcm_values[idx] = int(rounded_val * 10**PRECISION) + lcm_val = np.lcm.reduce(lcm_values) + lcm = lcm_val / 10**PRECISION + + # round start_idx to nearest value of lcm + return numeric_ceil(start_idx, lcm) + + else: + assert isinstance(start_idx, pd.Timestamp) + # transform to timedeltas and use numeric LCM on asm8 nanoseconds + for idx, val in enumerate(lcm_values): + parsed_time = parse_time_arg(val) + lcm_values[idx] = parsed_time.asm8.astype(np.int64) + + lcm_ns = np.lcm.reduce(lcm_values) + lcm_timedelta = pd.Timedelta(lcm_ns) + + ceiltime_timestamp = dt_ceil(start_idx, lcm_timedelta) + return ceiltime_timestamp + + elif is_float(exact_time): + assert is_numeric(start_idx) + return numeric_ceil(float(start_idx), exact_time) + elif is_int(exact_time): + assert is_numeric(start_idx) + return numeric_ceil(start_idx, exact_time) + elif isinstance(exact_time, str): + assert isinstance(start_idx, pd.Timestamp) + # should be normal time offset string + return start_idx.ceil(exact_time) + elif isinstance(exact_time, pd.Timedelta): + assert isinstance(start_idx, pd.Timestamp) + rounded_timestamp = dt_ceil(start_idx, exact_time) + return rounded_timestamp + else: + raise 
TypeError( + f"type: {exact_time_type} is not supported as `exact_time` argument - {exact_time}" + ) + def calculate( self, data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]], @@ -349,6 +515,7 @@ def calculate( Union[list, np.ndarray, pd.Series, pd.Index] ] = None, segment_end_idxs: Optional[Union[list, np.ndarray, pd.Series, pd.Index]] = None, + exact_time: Optional[Union[bool, str, pd.Timedelta, int, float]] = True, return_df: Optional[bool] = False, window_idx: Optional[str] = "end", include_final_window: Optional[bool] = False, @@ -419,6 +586,31 @@ def calculate( As such, the user can create variable-length segmented windows. However, in such cases, the user should be weary that the feature functions are invariant to these (potentially variable-length) windows. + exact_time: Optional[Union[bool, str, pd.Timedelta, int, float]], optional + Perform the start index rounding. This argument supports multiple types:\n + * If the type is a `bool`, rounding resolution will be calculated using + least common multiple of stride and window. + * If the type is a `str`, the string must represent a frequency string indicating + the rounding resolution. Hence, the **passed data must have a time-index**. + * If the type is an `float` or an `int`, its value represents the series:\n + - its stride **range** when a **non time-indexed** series is passed. + * If the exact_time's type is a `pd.Timedelta`, the exact_time size represents + the exact_time-time delta. The passed data **must have a time-index**. + start_idx is rounded to multiple of exact_time, using ceiling rounding. 
+ + The functioning of this parameter can be described by the following table: + + | index datatype | rounding datatype | return datatype | extra info | + | :------------- | :---------------- | --------------: | ----------------------------------------------------------------: | + | int | int | int | round `index` to nearest multiple of `rounding` | + | | float | float | round `index` to nearest multiple of `rounding` | + | | bool | float | round `index` to LCM of `window` and/or `stride` | + | float | int | float | round `index` to nearest multiple of `rounding` | + | | float | float | round `index` to nearest multiple of `rounding` | + | | bool | float | round `index` to LCM of `window` and/or `stride` | + | pd.Timestamp | str | pd.Timestamp | round `index` to resolution of `rounding` (e.g. '10s', '2m', 'H') | + | | bool | pd.Timestamp | round `index` to LCM of `window` and/or `stride` | + | | pd.Timedelta | pd.Timestamp | round `index` to nearest multiple of `rounding`. | return_df : bool, optional Whether the output needs to be a DataFrame or a list thereof, by default False. If `True` the output dataframes will be merged to a DataFrame with an @@ -592,6 +784,7 @@ def calculate( # Determine the bounds of the series dict items and slice on them # TODO: is dit wel nodig `hier? want we doen dat ook in de strided rolling start, end = _determine_bounds(bound_method, list(series_dict.values())) + series_dict = { n: s.loc[ s.index.dtype.type(start) : s.index.dtype.type(end) @@ -608,6 +801,7 @@ def calculate( segment_start_idxs=segment_start_idxs, segment_end_idxs=segment_end_idxs, start_idx=start, + exact_time=exact_time, end_idx=end, window_idx=window_idx, include_final_window=include_final_window,