-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfunctions.py
More file actions
196 lines (136 loc) · 5.84 KB
/
functions.py
File metadata and controls
196 lines (136 loc) · 5.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def data_generator(location, line_nums):
    """
    Read selected lines from a text file and split each on whitespace.

    - param location: path of the input file
    - param line_nums: iterable of 0-based line indices to keep
    - return: list of lists - one inner list per selected line, with the
      line split on whitespace (this removes the '\t' separators)
    """
    # Build the index set once: O(1) membership per line instead of O(n),
    # and it also tolerates line_nums being a one-shot generator.
    wanted = set(line_nums)
    with open(location, 'r') as file:
        selected = [line for i, line in enumerate(file) if i in wanted]
    # str.split() with no argument splits on any run of whitespace (incl. '\t')
    return [line.split() for line in selected]
def value_extractor(input_list):
    """
    The sampling time is repeated before each sensor value in a row - this
    function extracts only the values (every odd-indexed element).

    - param input_list: one row of data, alternating [time, value, time, value, ...]
    - return: a list containing only the values (NOT the sampling times)
    * Used in data_maker()

    Note: the previous implementation iterated range(1, len(input_list) + 1, 2),
    whose upper bound indexes one past the end for odd-length input and raises
    IndexError; the slice below is identical for the expected even-length rows
    and safe for any length (including empty input).
    """
    return input_list[1::2]
def data_maker(list_inputs):
    """
    Build a dictionary of the data:
    key = sampling time (first element of each row) -- value = the sensor
    readings extracted from that row.

    - param list_inputs: a nested list, one inner list per input line
    - return: dict mapping each row's label to its list of values
    """
    # Each row starts with its sampling-time label; value_extractor pulls
    # out the readings that follow it.
    return {row[0]: value_extractor(row) for row in list_inputs}
def column_namer(df, df_info):
    """
    Assign a proper name to each column using the information dataframe.

    - param df: dataframe of sensor data with sampling times as column names
    - param df_info: dataframe whose 'Name' column holds the sensor names
    - return: transposed dataframe - columns are sensor names, index is the
      sampling times
    """
    transposed = df.T                  # rows <-> columns
    transposed.columns = df_info['Name']
    return transposed
def column_renamer(df):
    """
    The naming convention prefixes each column with its datatype's first
    letter; strip that leading character from every column name.

    - param df: dataframe with sensor names (type-prefixed) as columns
    - return: the same dataframe, renamed in place, with the prefix removed
    """
    # Explicit old-name -> new-name mapping instead of a lambda.
    trimmed = {name: name[1:] for name in df.columns}
    df.rename(columns=trimmed, inplace=True)
    return df
def apply_data_type(df, df_info):
    """
    Assign the proper dtype to each column based on the source-file metadata:
        'BIT'    --> bool
        'REAL32' --> float32
        'REAL64' --> float64
        'INT16'  --> int16
    Columns whose declared type is not one of the above are left unchanged.

    - param df: dataframe of sensor data with proper column/row naming
    - param df_info: dataframe whose 'Data-Type' column lists each sensor's type
    - return: the dataframe with dtypes applied (modified in place)
    """
    for col_name, data_type in zip(df.columns, df_info['Data-Type']):
        if data_type == 'BIT':
            # Values arrive as strings, so cast via int: '0'/'1' -> 0/1 -> False/True
            # (astype('bool') directly would make every non-empty string True).
            df[col_name] = df[col_name].astype('int').astype('bool')
        # NOTE: previously this was a separate `if` chain; the branches are
        # mutually exclusive, so a single elif chain is equivalent and clearer.
        elif data_type == 'REAL32':
            df[col_name] = df[col_name].astype('float32')
        elif data_type == 'REAL64':
            df[col_name] = df[col_name].astype('float64')
        elif data_type == 'INT16':
            df[col_name] = df[col_name].astype('int16')
    return df
def mean_calculator(df):
    """
    Compute the per-column mean, skipping boolean columns.

    - param df: dataframe with properly assigned dtypes and naming
    - return: dict {sensor name: mean of that sensor's values over time}
    """
    # Boolean flags have no meaningful average, so filter them out.
    return {
        name: df[name].mean()
        for name in df.columns
        if df[name].dtype != 'bool'
    }
def data_reformater(dict_results):
    """
    JSON does not support numpy float32: before writing the JSON file,
    convert any float32 values to float64 (which json can serialize).

    - param dict_results: dict of averages
      {sensor name: average for that sensor over time}
    - return: the same dict, modified in place, with float32 values widened
      to float64; all other values are left untouched
    """
    for key, value in dict_results.items():
        # isinstance is the idiomatic type check (type(...) == ... ignores
        # subclasses and is flagged by linters); behavior is otherwise identical.
        if isinstance(value, np.float32):
            dict_results[key] = np.float64(value)
    return dict_results
import numpy as np
import pandas as pd
import json
def dict_combiner(dict_assignment_info, dict_data_specs):
    """
    Merge the two result dictionaries: the mean values are stored under the
    'Data Specification' key as a single-element list of dictionaries.

    - param dict_assignment_info: dict with the assignment's info (mutated in place)
    - param dict_data_specs: dict holding the mean values
    - return: dict_assignment_info, now carrying both sets of information
    """
    dict_assignment_info.update({'Data Specification': [dict_data_specs]})
    return dict_assignment_info
def json_writer(python_dict):
    """
    Serialize a python dict to the file 'results_Erfan.json'.

    - param python_dict: a python dictionary
    """
    # indent=2 keeps the output human-readable
    serialized = json.dumps(python_dict, indent=2)
    with open("results_Erfan.json", 'w') as out_file:
        out_file.write(serialized)
def csv_writer(df):
    """
    Write a dataframe out as a CSV file named 'dataframe.csv'.

    - param df: a pandas dataframe
    """
    output_name = "dataframe.csv"
    df.to_csv(output_name)