Source code for core.preprocessing.outlier

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


[docs]def remove_outlier(dataset, auto_fill=True, ratio=1.5): """ Remove potential outliers using IQR, and fill with numpy.nan. Outliers are the value that less than value at the first quantile - ratio * IOR or greater than value at the third quantile + ratio * IOR in its corresponding column :parameter dataset: Dataset object that wants to remove outliers :type dataset: core.data.dataset.Dataset :parameter auto_fill: whether automatically fill the outliers or leave it nan :type auto_fill: bool :parameter ratio: IQR ratio in order to mark value as an outlier :type ratio: float :return: None """ from ..data import Dataset from .fill import fill from numpy import percentile, isnan, nonzero, nan if not isinstance(dataset, Dataset): raise TypeError("Dataset has to be class core.data.dataset.Dataset") data = dataset.data time_col = dataset.time_column_index for i in range(data.shape[1]): if i == time_col: continue column = nonzero(~isnan(data[:, i]))[0] q25, q75 = percentile(data[column, i], (25, 75)) iqr = q75 - q25 data[column[nonzero(data[column, i] < (q25 - ratio * iqr))[0]], i] = nan data[column[nonzero(data[column, i] > (q75 + ratio * iqr))[0]], i] = nan dataset.change_values(data) if auto_fill: fill(dataset)