Source code for core.preprocessing.ontology

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


def ontology(dataset):
    """
    Update feature names to the standard glossary.

    :parameter dataset: Dataset object or list of feature names to map to the standard glossary
    :type dataset: core.data.dataset.Dataset or list(str)
    :rtype: list(str)
    :return: The edited feature list
    """
    from ..data import Dataset
    from Levenshtein import jaro

    dictionary = {
        "co2": ["co2", "carbon dioxide"],
        "humidity": ["humidity", "humidness", "wetness", "moisture"],
        "temperature": ["temp", "temperature", "indoor temperature", "environment temperature"],
        "out-temperature": ["outside temperature", "outside temp", "outdoor temp", "outdoor temperature"],
        "damper": ["damper", "damper position"],
        "voc": ["voc", "volatile organic compounds"],
        "air": ["air", "air velocity", "wind"],
        "cloud": ["cloud", "cloud coverage", "cloud ratio"],
        "radiator": ["radiator value", "radval"],
        "pressure": ["pressure", "air pressure", "indoor pressure", "pa"],
        "light": ["light", "sun light", "brightness"],
    }

    header = dataset
    return_list = True
    if isinstance(dataset, Dataset):
        header = dataset.feature_list
        return_list = False
    if not isinstance(header, list):
        raise TypeError("Cannot recognize the feature_list")

    new_header = list()
    for word in header:
        maximum_score = 0
        similar_word = ''
        # Find the glossary key whose synonyms best match the feature name.
        for target_word in dictionary.keys():
            target_word_score = 0
            for possible_word in dictionary[target_word]:
                score = jaro(possible_word, word.lower())
                if target_word_score < score:
                    target_word_score = score
            if target_word_score > maximum_score:
                maximum_score = target_word_score
                similar_word = target_word
        # Only rename the feature when the best match is sufficiently close.
        if maximum_score > 0.9:
            new_header.append(similar_word)
        else:
            new_header.append(word)

    if not return_list:
        dataset.set_feature_name(new_header)
    return new_header
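
For illustration, a minimal usage sketch of calling the function with a plain list of feature names; it assumes the module is importable as core.preprocessing.ontology and that the Levenshtein package (python-Levenshtein) is installed. The feature names below are hypothetical examples, not taken from the project's data.

# Minimal usage sketch; hypothetical feature names.
from core.preprocessing.ontology import ontology

raw_features = ["Indoor Temperature", "Carbon Dioxide", "Air Pressure", "occupancy"]
standardized = ontology(raw_features)
print(standardized)
# The first three names are exact synonyms in the glossary, so they map to
# "temperature", "co2", and "pressure". "occupancy" has no close glossary
# entry (Jaro score <= 0.9), so it is returned unchanged.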