quantipy/core/tools/dp/dimensions/reader.py (66 changes: 21 additions & 45 deletions)
@@ -36,7 +36,7 @@
'6': 'float',
'7': 'boolean'
}
-RE_GRID_SLICES = "[^{.]+(?=[}]|$|\[)"
+RE_GRID_SLICES = r'[^{.]+(?=[}]|$|\[)'
XPATH_DEFINITION = '//definition'
XPATH_VARIABLES = '//design//fields//variable'
XPATH_LOOPS = '//design//fields//loop'
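# A note on the raw-string change above (mine, not the PR's): in a normal
# string literal '\[' is an invalid escape sequence, which CPython 3.6+ flags
# with a DeprecationWarning (a SyntaxWarning in newer releases); the raw
# string keeps the compiled pattern identical. A minimal sketch of what the
# pattern slices out of a grid reference, with an input of my own making:
import re
print(re.findall(r'[^{.]+(?=[}]|$|\[)', 'grid[{row}].column'))
# -> ['grid', 'row', 'column']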
@@ -81,10 +81,10 @@ def ddf_to_pandas(path_ddf):
levels = sql['Levels']
table_name_map = dict(levels['DSCTableName'])
table_name_map['L1'] = 'HDATA'
-level_id_map = {}
+# level_id_map = {}
new_levels_index = ['HDATA']
for table_name in levels.index[1:]:
-new_table_name = levels.ix[table_name,'DSCTableName']
+new_table_name = levels.loc[table_name,'DSCTableName']
ddf[new_table_name] = sql[table_name]
new_levels_index.append(new_table_name)
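# Context for the .ix -> .loc changes throughout this file (my note): .ix was
# deprecated in pandas 0.20 and removed in 1.0, so label-based lookups need
# .loc. A stand-in frame to show the equivalent spelling:
import pandas as pd
levels = pd.DataFrame({'DSCTableName': ['HDATA', 'GRID1']}, index=['L1', 'L2'])
# levels.ix['L2', 'DSCTableName']        # AttributeError on pandas >= 1.0
print(levels.loc['L2', 'DSCTableName'])  # 'GRID1', label-based lookup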

@@ -97,11 +97,9 @@ def ddf_to_pandas(path_ddf):

return ddf

-def timestamp_to_ISO8610(timestamp, offset_date="1900-01-01",as_string=False, adjuster=None):

+def timestamp_to_ISO8610(timestamp, offset_date="1900-01-01",
+                         as_string=False, adjuster=None):

-offset = np.datetime64(offset_date).astype("float") * DAYS_TO_MS
+offset = np.datetime64(offset_date).astype("float64") * DAYS_TO_MS
day = timestamp * DAYS_TO_MS
date = (day + offset).astype("datetime64[ms]")
if not adjuster is None:
@@ -111,7 +109,6 @@ def timestamp_to_ISO8610(timestamp, offset_date="1900-01-01",

return date
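# How the conversion works, as I read it (the name presumably means ISO 8601):
# the DDF stores timestamps as fractional days since offset_date, so both
# terms are scaled to milliseconds before the datetime64[ms] cast. A worked
# example, assuming DAYS_TO_MS = 24 * 60 * 60 * 1000 = 86_400_000:
import numpy as np
DAYS_TO_MS = 24 * 60 * 60 * 1000
offset = np.datetime64("1900-01-01").astype("float64") * DAYS_TO_MS
print((1.5 * DAYS_TO_MS + offset).astype("datetime64[ms]"))
# -> 1900-01-02T12:00:00.000, a day and a half after the offset date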


def get_datetime_values(var_df, adjuster, as_string=True):

dates = var_df.astype(float).apply(
@@ -126,7 +123,6 @@ def get_datetime_values(var_df, adjuster, as_string=True):
else:
return dates


def quantipy_clean(ddf):

clean = {}
@@ -181,7 +177,7 @@ def quantipy_clean(ddf):
# Coerce column dtypes for expected Quantipy usage
# methods and functions by type
if 'single' in types_df.index:
-columns = types_df.ix['single','column']
+columns = types_df.loc['single','column']
if isinstance(columns, str):
columns = [columns]
for column in columns:
@@ -199,7 +195,7 @@ def quantipy_clean(ddf):
ddf[n_tab][column].replace(-1, np.NaN, inplace=True)

if 'date' in types_df.index:
-columns = types_df.ix['date','column']
+columns = types_df.loc['date','column']
if isinstance(columns, str):
columns = [columns]
for column in columns:
@@ -210,7 +206,7 @@ def quantipy_clean(ddf):
)

if 'boolean' in types_df.index:
-columns = types_df.ix['boolean','column']
+columns = types_df.loc['boolean','column']
if isinstance(columns, str):
columns = [columns]
for column in columns:
@@ -220,14 +216,12 @@ def quantipy_clean(ddf):

return clean, ddf['Levels']
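# The isinstance(columns, str) guards in quantipy_clean cover a real pandas
# quirk: .loc returns a bare scalar when the index label occurs once, but a
# Series when it repeats. A minimal sketch with made-up data:
import pandas as pd
types_df = pd.DataFrame({'column': ['q1', 'q2', 'q3']},
                        index=['single', 'single', 'date'])
print(types_df.loc['date', 'column'])          # 'q3', a plain string
print(list(types_df.loc['single', 'column']))  # ['q1', 'q2'], from a Series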


def force_single_from_delimited(data):

data = data.apply(lambda x: x.str.replace(';', ''))
data = data.convert_objects(convert_numeric=True)
return data
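# One thing this hunk leaves untouched: DataFrame.convert_objects was
# deprecated in pandas 0.17 and removed in later releases, so this helper
# breaks on a modern stack. A possible rewrite, assuming numeric coercion
# (NaN for unparseable cells) is all that is wanted here:
import pandas as pd
def force_single_from_delimited(data):
    # drop the Dimensions ';' delimiters, then coerce column-wise to numbers
    data = data.apply(lambda x: x.str.replace(';', '', regex=False))
    return data.apply(pd.to_numeric, errors='coerce')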


def as_L1(child, parent=None, force_single=False):

if parent is None:
@@ -294,7 +288,6 @@ def as_L1(child, parent=None, force_single=False):

return child_as_L1


def get_var_type(var):

mdd_type = MDD_TYPES_MAP[var.get('type')]
@@ -306,7 +299,6 @@ def get_var_type(var):

return mdd_type


def get_text_dict(source):

text = {
@@ -318,7 +310,6 @@ def get_text_dict(source):
text[tk] = ""
return text


def get_meta_values(xml, column, data, map_values=True):

if '.' in column['name']:
@@ -330,7 +321,7 @@ def get_meta_values(xml, column, data, map_values=True):
var_name = column['name']

column_values = []
-column_factors = []
+# column_factors = []

if is_grid:
# this protects against the scenario where multiple grids
@@ -345,7 +336,6 @@ def get_meta_values(xml, column, data, map_values=True):
field_ref = field.get('ref')
xpath_var = XPATH_DEFINITION+"//variable[@id='"+field_ref+"']"
xpath_categories = xpath_var+"//categories//category"

else:
xpath_var = XPATH_DEFINITION+"//variable[@name='"+var_name+"']"
xpath_categories = xpath_var+"//categories//category"
@@ -425,14 +415,14 @@ def get_meta_values(xml, column, data, map_values=True):
values.append(int(v))
msg = 'Null in category values for {} will be replaced with empty value.'.format(
var_name)
-except Exception as e:
+except Exception:
values = range(1, len(categories)+1)
msg = 'NULL in values for {} will be replaced with empty value'.format(var_name)
warnings.warn(msg)
else:
values = list(range(1, len(categories)+1))
msg = 'Category values for {} will be taken byPosition'.format(var_name)
-warnings.warn(msg)
+# warnings.warn(msg) ' Uncomment this before shipping out

# handy trouble-shooting printout for figuring out where category values
# have come from.
@@ -474,7 +464,6 @@ def get_meta_values(xml, column, data, map_values=True):

return column_values, value_map


def remap_values(data, column, value_map):
if column['type'] in ['single']:
missing = [
@@ -505,7 +494,6 @@ def remap_values(data, column, value_map):

return False


def map_delimited_values(y, value_map, col_name):
"""
Map the delimited values using the given mapper, dropping unknown responses.
@@ -545,15 +533,14 @@ def map_delimited_values(y, value_map, col_name):

return y
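# The shape of the transformation, as a toy example (the helper and data are
# mine; the real function above also handles NaNs, empties and logging):
value_map = {1: 101, 3: 103}
def map_one(y, mapper):
    codes = [int(c) for c in y.split(';') if c]            # '1;2;3;' -> [1, 2, 3]
    kept = [str(mapper[c]) for c in codes if c in mapper]  # drop unknown codes
    return (';'.join(kept) + ';') if kept else ''
print(map_one('1;2;3;', value_map))  # '101;103;'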


def begin_column(xml, col_name, data):

column = {}

xpath_var = XPATH_DEFINITION+"//variable[@name='"+col_name+"']"
try:
var = xml.xpath(xpath_var)[0]
-except Exception as e:
+except Exception:
column['name'] = col_name
column['properties'] = get_meta_properties(xml, xpath_var)
column['type'] = 'string'
@@ -574,7 +561,6 @@ def begin_column(xml, col_name, data):

return column


def get_meta_properties(xml, xpath_var, exclude=None):

if exclude is None:
@@ -597,7 +583,6 @@ def get_meta_properties(xml, xpath_var, exclude=None):

return properties


def map_cols_from_grid(xml, data):

needs_mapping = False
@@ -644,7 +629,6 @@ def map_cols_from_grid(xml, data):

return data


def get_mdd_xml(path_mdd):

#with open(path_mdd, 'r+') as f:
@@ -655,7 +639,6 @@ def get_mdd_xml(path_mdd):

return xml


def get_grid_elements(xml, grid_name):

xpath_elements = XPATH_LOOPS+"[@name='"+grid_name+"']//categories"
@@ -668,7 +651,6 @@ def get_grid_elements(xml, grid_name):

return elements, xpath_elements


def get_columns_meta(xml, meta, data, map_values=True):

columns = {}
@@ -702,7 +684,7 @@ def get_columns_meta(xml, meta, data, map_values=True):
)

if not mm_name in meta['masks']:
-# xpath_grid = "//design//grid[@name='%s']" % mm_name
+# xpath_grid = "//design//grid[@name='%s']" % mm_name
xpath_grid = "//design//grid[@name='%s']" % mm_name.split('.')[0]
if not xml.xpath(xpath_grid):
xpath_grid = "//design//loop[@name='%s']" % mm_name.split('.')[0]
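# Why the split: the mask key carries the item suffix, while the design
# section indexes grids and loops by the bare grid name. With an illustrative
# name of my own:
mm_name = 'Q1Grid.Q1Item'
print("//design//grid[@name='%s']" % mm_name.split('.')[0])
# -> //design//grid[@name='Q1Grid']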
@@ -768,7 +750,6 @@ def get_columns_meta(xml, meta, data, map_values=True):

return meta, columns, data


def mdd_to_quantipy(path_mdd, data, map_values=True):

meta = {}
@@ -949,10 +930,10 @@ def mdd_to_quantipy(path_mdd, data, map_values=True):
for item in meta['sets'][k]['items']
if item in mask_items
]
-# meta['masks'][k]['items'] = [
-# {'source': i}
-# for i in meta['sets'][k]['items']
-# ]
+# meta['masks'][k]['items'] = [
+# {'source': i}
+# for i in meta['sets'][k]['items']
+# ]

meta['sets']['data file']['items'] = updated_design_set

@@ -964,19 +945,17 @@ def mdd_to_quantipy(path_mdd, data, map_values=True):

return meta, data


def get_mask_item(mask, source, k):
for item in mask['items']:
if item['source']==source:
return item


def quantipy_from_dimensions(path_mdd, path_ddf, fields='all', grids=None):

ddf, levels = quantipy_clean(ddf_to_pandas(path_ddf))
L1 = ddf['HDATA'].copy()
L1.drop('LevelId_HDATA', axis=1, inplace=True)
-# L1.dropna(axis=1, how='all', inplace=True)
+# L1.dropna(axis=1, how='all', inplace=True)

if isinstance(fields, (list, tuple)):
L1 = L1[['id_HDATA']+fields]
@@ -990,7 +969,7 @@ def quantipy_from_dimensions(path_mdd, path_ddf, fields='all', grids=None):
empty_grids = []
for grid_name in grids:
if not any(levels['ParentName'].isin([grid_name])):
-parent_name = levels.loc[grid_name, 'ParentName']
+# parent_name = levels.loc[grid_name, 'ParentName']
if grid_name in list(ddf.keys()):
single_level.append(as_L1(child=ddf[grid_name]))
else:
@@ -1040,18 +1019,16 @@ def quantipy_from_dimensions(path_mdd, path_ddf, fields='all', grids=None):
datafile.remove(item)
meta['sets']['data file']['items'] = datafile


for key, col in meta['columns'].items():
if col['type']=='string' and key in ddf:
ddf[key] = ddf[key].apply(qp.core.tools.dp.io.unicoder)
if col['type']=='int' and key in ddf:
ddf[key] = ddf[key].replace('null', 0)

-mdd, ddf = verify_columns(meta, ddf)
+meta, ddf = verify_columns(meta, ddf)

return meta, ddf
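# The mdd -> meta rename above is a real fix, not cosmetics: the verified
# result used to be bound to an unused name, so the function returned whatever
# `meta` already pointed at, which is only harmless while verify_columns
# mutates its argument in place. A toy illustration of the pitfall (names are
# mine):
def verify(meta, data):
    meta = dict(meta, verified=True)  # rebinding: a *new* dict is returned
    return meta, data
meta, data = {}, []
mdd, data = verify(meta, data)        # old pattern: bound to the wrong name
print('verified' in meta)             # False, meta was never rebound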


def verify_columns(mdd, ddf):
"""
Ensure all columns in the data appear in the meta.
@@ -1068,7 +1045,6 @@ def verify_columns(mdd, ddf):

return mdd, ddf


def order_by_meta(data, columns, masks):
"""
Check and re-order data.columns against meta['sets']['data file']['items'].
@@ -1085,5 +1061,5 @@ def _get_column_items(columns, masks):
return result
new_order = ["id_L1"]
new_order.extend(_get_column_items(columns, masks))
-#data = data.ix[:, new_order]
+#data = data.loc[:, new_order]
return data
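# Worth noting (my reading of the hunk above): the reordering stays commented
# out, so order_by_meta still returns `data` unchanged and new_order is
# computed but unused. If it is ever re-enabled, the .loc spelling is the one
# that survives on modern pandas:
#     data = data.loc[:, new_order]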