Skip to content

Commit 15fba06

Browse files
committed
create_instances_from_lists and create_instances_from_matrices can force columns to be nominal now; load_csv_file can force columns to be nominal now
1 parent e9b4904 commit 15fba06

File tree

4 files changed

+103
-11
lines changed

4 files changed

+103
-11
lines changed

CHANGES.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Changelog
66

77
- the methods `create_instances_from_lists` and `create_instances_from_matrices` of the
88
`weka.core.dataset` module can handle missing values now (`None` in case of lists,
9-
`nan` in case of matrices)
9+
`nan` in case of matrices) and can also force columns to be nominal
1010
- added the method `load_csv_file` to the module `weka.core.converters` to provide a more
1111
reliable way of loading CSV files compared to Weka's native `CSVLoader` converter
1212
(uses Python's csv module and then calls `create_instances_from_lists`).

python/weka/core/converters.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
361361
return result
362362

363363

364-
def load_csv_file(filename, dialect="excel", delimiter=",", quotechar='"', num_cols=None):
364+
def load_csv_file(filename, dialect="excel", delimiter=",", quotechar='"', num_cols=None, nom_cols=None):
365365
"""
366366
Loads a CSV file using the Python csv module and then converts it
367367
to an Instances object. Better at reading CSV files than Weka's
@@ -379,6 +379,8 @@ def load_csv_file(filename, dialect="excel", delimiter=",", quotechar='"', num_c
379379
:param quoting: how the quoting works
380380
:param num_cols: the list of 0-based column indices that are numeric, default for cols is str
381381
:type num_cols: list
382+
:param nom_cols: the list of 0-based column indices that are nominal, default for cols is str
383+
:type nom_cols: list
382384
"""
383385
with open(filename) as fp:
384386
r = csv.reader(fp, dialect=dialect, delimiter=delimiter, quotechar=quotechar)
@@ -398,4 +400,4 @@ def load_csv_file(filename, dialect="excel", delimiter=",", quotechar='"', num_c
398400
row[num_col] = None
399401
data.append(row)
400402

401-
return create_instances_from_lists(data, cols_x=header, name=os.path.basename(filename))
403+
return create_instances_from_lists(data, cols_x=header, name=os.path.basename(filename), nominal_x=nom_cols)

python/weka/core/dataset.py

Lines changed: 75 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,7 +1747,7 @@ def check_col_names_unique(cols_x, col_y=None):
17471747
return None
17481748

17491749

1750-
def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None):
1750+
def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None, nominal_x=None, nominal_y=False):
17511751
"""
17521752
Allows the generation of an Instances object from a list of lists for X and a list for Y (optional).
17531753
Data can be numeric, string or bytes. Attributes can be converted to nominal with the
@@ -1764,6 +1764,10 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
17641764
:type cols_x: list
17651765
:param col_y: the column name to use for the output variable (y)
17661766
:type col_y: str
1767+
:param nominal_x: the list of 0-based column indices to treat as nominal ones, ignored if None
1768+
:type nominal_x: list
1769+
:param nominal_y: whether the y column is to be treated as nominal
1770+
:type nominal_y: bool
17671771
:return: the generated dataset
17681772
:rtype: Instances
17691773
"""
@@ -1783,6 +1787,16 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
17831787
if msg is not None:
17841788
raise Exception(msg)
17851789

1790+
# nominal x columns?
1791+
nominal_x_values = None
1792+
if nominal_x is not None:
1793+
nominal_x_values = dict()
1794+
for nominal_col in nominal_x:
1795+
labels = set()
1796+
for n in range(len(x)):
1797+
labels.add(typeconv.to_string(x[n][nominal_col]))
1798+
nominal_x_values[nominal_col] = sorted(list(labels))
1799+
17861800
# create header
17871801
atts = []
17881802
type_x = []
@@ -1792,6 +1806,9 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
17921806
for n in range(len(x)):
17931807
if x[n][i] is None:
17941808
continue
1809+
if (nominal_x_values is not None) and (i in nominal_x_values):
1810+
type_x[i] = "C"
1811+
break
17951812
if isinstance(x[n][i], float) or isinstance(x[n][i], int):
17961813
type_x[i] = "N"
17971814
break
@@ -1811,15 +1828,29 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
18111828
atts.append(Attribute.create_string(cols_x[i]))
18121829
elif type_x[i] == "S":
18131830
atts.append(Attribute.create_string(cols_x[i]))
1831+
elif type_x[i] == "C":
1832+
atts.append(Attribute.create_nominal(cols_x[i], nominal_x_values[i]))
18141833
else:
18151834
print("WARNING: Failed to determine data type for column #%d" % i)
18161835
atts.append(Attribute.create_numeric(cols_x[i]))
18171836

18181837
type_y = ""
18191838
if y is not None:
1839+
# nominal y column?
1840+
nominal_y_values = None
1841+
if nominal_y:
1842+
labels = set()
1843+
for n in range(len(y)):
1844+
labels.add(typeconv.to_string(y[n]))
1845+
nominal_y_values = sorted(list(labels))
1846+
18201847
for n in range(len(y)):
18211848
if y[n] is None:
18221849
continue
1850+
if nominal_y:
1851+
type_y = "C"
1852+
atts.append(Attribute.create_nominal(col_y, nominal_y_values))
1853+
break
18231854
if isinstance(y[n], float) or isinstance(y[n], int):
18241855
type_y = "N"
18251856
atts.append(Attribute.create_numeric(col_y))
@@ -1849,7 +1880,9 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
18491880
if type_x[n] == "N":
18501881
values.append(x[i][n])
18511882
elif type_x[n] == "B":
1852-
values.append(result.attribute(n).add_string_value(x[i][n].decode("utf-8")))
1883+
values.append(result.attribute(n).add_string_value(typeconv.to_string(x[i][n])))
1884+
elif type_x[n] == "C":
1885+
values.append(result.attribute(n).index_of(typeconv.to_string(x[i][n])))
18531886
else:
18541887
values.append(result.attribute(n).add_string_value(x[i][n]))
18551888

@@ -1860,7 +1893,9 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
18601893
if type_y == "N":
18611894
values.append(y[i])
18621895
elif type_y == "B":
1863-
values.append(result.attribute(result.num_attributes - 1).add_string_value(y[i].decode("utf-8")))
1896+
values.append(result.attribute(result.num_attributes - 1).add_string_value(typeconv.to_string(y[i])))
1897+
elif type_y == "C":
1898+
values.append(result.attribute(result.num_attributes - 1).index_of(typeconv.to_string(y[i])))
18641899
else:
18651900
values.append(result.attribute(result.num_attributes - 1).add_string_value(y[i]))
18661901

@@ -1869,7 +1904,7 @@ def create_instances_from_lists(x, y=None, name="data", cols_x=None, col_y=None)
18691904
return result
18701905

18711906

1872-
def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=None):
1907+
def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=None, nominal_x=None, nominal_y=False):
18731908
"""
18741909
Allows the generation of an Instances object from a 2-dimensional matrix for X and a
18751910
1-dimensional matrix for Y (optional).
@@ -1887,6 +1922,10 @@ def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=No
18871922
:type cols_x: list
18881923
:param col_y: the column name to use for the output variable (y)
18891924
:type col_y: str
1925+
:param nominal_x: the list of 0-based column indices to treat as nominal ones, ignored if None
1926+
:type nominal_x: list
1927+
:param nominal_y: whether the y column is to be treated as nominal
1928+
:type nominal_y: bool
18901929
:return: the generated dataset
18911930
:rtype: Instances
18921931
"""
@@ -1906,11 +1945,25 @@ def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=No
19061945
if msg is not None:
19071946
raise Exception(msg)
19081947

1948+
# nominal x columns?
1949+
nominal_x_values = None
1950+
if nominal_x is not None:
1951+
nominal_x_values = dict()
1952+
for nominal_col in nominal_x:
1953+
labels = set()
1954+
for n in range(len(x)):
1955+
labels.add(typeconv.to_string(x[n][nominal_col]))
1956+
nominal_x_values[nominal_col] = sorted(list(labels))
1957+
19091958
# create header
19101959
atts = []
19111960
type_x = []
19121961
for i in range(len(x[0])):
19131962
try:
1963+
if (nominal_x_values is not None) and (i in nominal_x_values):
1964+
type_x.append("C") # nominal
1965+
atts.append(Attribute.create_nominal(cols_x[i], nominal_x_values[i]))
1966+
continue
19141967
len(x.dtype)
19151968
if np.issubdtype(x.dtype[i], np.number):
19161969
type_x.append("N") # number
@@ -1926,7 +1979,18 @@ def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=No
19261979
atts.append(Attribute.create_numeric(cols_x[i]))
19271980
type_y = ""
19281981
if y is not None:
1929-
if np.issubdtype(y.dtype, np.number):
1982+
# nominal y column?
1983+
nominal_y_values = None
1984+
if nominal_y:
1985+
labels = set()
1986+
for n in range(len(y)):
1987+
labels.add(typeconv.to_string(y[n]))
1988+
nominal_y_values = sorted(list(labels))
1989+
1990+
if nominal_y:
1991+
type_y = "C"
1992+
atts.append(Attribute.create_nominal(col_y, nominal_y_values))
1993+
elif np.issubdtype(y.dtype, np.number):
19301994
type_y = "N" # number
19311995
atts.append(Attribute.create_numeric(col_y))
19321996
elif np.issubdtype(y.dtype, np.str_):
@@ -1949,8 +2013,10 @@ def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=No
19492013
values.append(x[i][n])
19502014
elif type_x[n] == "S":
19512015
values.append(result.attribute(n).add_string_value(x[i][n]))
2016+
elif type_x[n] == "C":
2017+
values.append(result.attribute(n).index_of(typeconv.to_string(x[i][n])))
19522018
else:
1953-
values.append(result.attribute(n).add_string_value(x[i][n].decode("utf-8")))
2019+
values.append(result.attribute(n).add_string_value(typeconv.to_string(x[i][n])))
19542020

19552021
if y is not None:
19562022
if isinstance(y[i], float) and np.isnan(y[i]):
@@ -1959,8 +2025,10 @@ def create_instances_from_matrices(x, y=None, name="data", cols_x=None, col_y=No
19592025
values.append(y[i])
19602026
elif type_y == "S":
19612027
values.append(result.attribute(result.num_attributes - 1).add_string_value(y[i]))
2028+
elif type_y == "C":
2029+
values.append(result.attribute(result.num_attributes - 1).index_of(typeconv.to_string(y[i])))
19622030
else:
1963-
values.append(result.attribute(result.num_attributes - 1).add_string_value(y[i].decode("utf-8")))
2031+
values.append(result.attribute(result.num_attributes - 1).add_string_value(typeconv.to_string(y[i])))
19642032

19652033
result.add_instance(Instance.create_instance(values))
19662034

python/weka/core/typeconv.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1313

1414
# typeconv.py
15-
# Copyright (C) 2014-2021 Fracpete (pythonwekawrapper at gmail dot com)
15+
# Copyright (C) 2014-2024 Fracpete (pythonwekawrapper at gmail dot com)
1616

1717
import javabridge
1818
import logging
@@ -138,3 +138,25 @@ def jdouble_to_float(d):
138138
:rtype: float
139139
"""
140140
return javabridge.call(d, "doubleValue", "()D", d)
141+
142+
143+
def to_string(o):
    """
    Turns the object into a string.

    ``None`` is passed through unchanged. ``bytes`` objects are decoded as
    UTF-8; if the bytes are not valid UTF-8, the result of ``str(o)`` (i.e.,
    the ``b'...'`` representation) is returned instead. All other objects
    are converted via ``str(o)``.

    :param o: the object to convert
    :return: the generated string, or None if the object was None
    :rtype: str
    """
    if o is None:
        return None

    if isinstance(o, bytes):
        try:
            return o.decode("utf-8")
        except UnicodeDecodeError:
            # not valid UTF-8: fall through to the generic str() conversion;
            # only decoding errors are swallowed, so KeyboardInterrupt and
            # SystemExit are no longer silently caught here
            pass

    return str(o)

0 commit comments

Comments
 (0)