diff --git a/toolkit.py b/toolkit.py
index d366a560728d9ec1f5f95f93cd88f13f62a6012f..29adbfb84cef516a9dc6eb3259b5a60690f75655 100755
--- a/toolkit.py
+++ b/toolkit.py
@@ -559,6 +559,16 @@ class ClassificationProject(object):
         return self._scaler
 
 
+    def transform(self, x):
+        "Apply the project's scaler to x"
+        return self.scaler.transform(x)
+
+
+    def inverse_transform(self, x):
+        "Apply the inverse transformation of the project's scaler to x"
+        return self.scaler.inverse_transform(x)
+
+
     @property
     def history(self):
         params_file = os.path.join(self.project_dir, "history_params.json")
@@ -592,7 +602,6 @@ class ClassificationProject(object):
 
     def _transform_data(self):
         if not self.data_transformed:
-            # todo: what to do about the outliers? Where do they come from?
             if logger.level <= logging.DEBUG:
                 logger.debug("training data before transformation: {}".format(self.x_train))
                 logger.debug("minimum values: {}".format([np.min(self.x_train[:,i][~np.isnan(self.x_train[:,i])])
@@ -929,7 +938,7 @@ class ClassificationProject(object):
 
     def evaluate(self, x_eval, mode=None):
         logger.debug("Evaluate score for {}".format(x_eval))
-        x_eval = self.scaler.transform(x_eval)
+        x_eval = self.transform(x_eval)
         logger.debug("Evaluate for transformed array: {}".format(x_eval))
         return self.predict(x_eval, mode=mode)
 
@@ -1697,12 +1706,38 @@ class ClassificationProjectRNN(ClassificationProject):
         eval_score("train")
 
 
+    def _batch_transform(self, x, fn, batch_size):
+        """Transform array in batches, temporarily setting mask_values to nan
+
+        fn is applied to copies of consecutive slices of at most batch_size
+        entries along the first axis, so x itself is not modified. nan values
+        in the output of fn are set to mask_value before they are stored.
+        """
+        # output needs the full shape of x, not just its length, to hold the batches
+        transformed = np.empty(np.shape(x))
+        for start in range(0, len(x), batch_size):
+            stop = start+batch_size
+            x_batch = np.array(x[start:stop]) # copy
+            x_batch[x_batch == self.mask_value] = np.nan
+            x_batch = fn(x_batch)
+            x_batch[np.isnan(x_batch)] = self.mask_value
+            transformed[start:stop] = x_batch
+        return transformed
+
+
+    def transform(self, x, batch_size=10000):
+        "Apply the scaler to x in batches of batch_size via _batch_transform"
+        return self._batch_transform(x, self.scaler.transform, batch_size)
+
+
+    def inverse_transform(self, x, batch_size=10000):
+        "Apply the inverse scaler transformation to x in batches of batch_size"
+        return self._batch_transform(x, self.scaler.inverse_transform, batch_size)
+
+
     def evaluate(self, x_eval, mode=None):
         logger.debug("Evaluate score for {}".format(x_eval))
-        x_eval = np.array(x_eval) # copy
-        x_eval[x_eval==self.mask_value] = np.nan
-        x_eval = self.scaler.transform(x_eval)
-        x_eval[np.isnan(x_eval)] = self.mask_value
+        x_eval = self.transform(x_eval)
         logger.debug("Evaluate for transformed array: {}".format(x_eval))
         return self.predict(self.get_input_list(x_eval), mode=mode)
 