Skip to content

Commit 11d0930

Browse files
authored
Added sort function and tested! fixes #51 (#223)
* added sort function and tested! fixes #51 * added CLI with test New command: sort #51 * applied sort * sort_df_column effects fixed * sort rows by column 1 and bug fix for negation * sort rows by column 1 and bug fix for negation #51 * renamed sort_columns to have rows and added bools * renamed CLI to `sort` * changes as per #225 * changes as per #225
1 parent efaddb7 commit 11d0930

File tree

7 files changed

+141
-10
lines changed

7 files changed

+141
-10
lines changed

sssom/cli.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
merge_msdf,
4141
reconcile_prefix_and_data,
4242
remove_unmatched,
43+
sort_df_rows_columns,
4344
to_mapping_set_dataframe,
4445
)
4546
from .writers import write_table
@@ -77,9 +78,9 @@
7778
type=click.Path(),
7879
help="The path to a file containing the sssom metadata (including prefix_map) to be used.",
7980
)
80-
transpose_option = click.option("-t", "--transpose/--no-transpose", default=False)
81+
transpose_option = click.option("-t", "--transpose", default=False)
8182
fields_option = click.option(
82-
"-F",
83+
"-f",
8384
"--fields",
8485
nargs=2,
8586
default=("subject_category", "object_category"),
@@ -212,7 +213,7 @@ def dedupe(input: str, output: TextIO):
212213

213214

214215
@main.command()
215-
@click.option("-q", "--query", help='SQL query. Use "df" as table name.')
216+
@click.option("-Q", "--query", help='SQL query. Use "df" as table name.')
216217
@click.argument("inputs", nargs=-1)
217218
@output_option
218219
def dosql(query: str, inputs: List[str], output: TextIO):
@@ -425,7 +426,7 @@ def correlations(input: str, output: TextIO, transpose: bool, fields: Tuple):
425426
@main.command()
426427
@click.argument("inputs", nargs=-1)
427428
@click.option(
428-
"-r",
429+
"-R",
429430
"--reconcile",
430431
default=True,
431432
help="Boolean indicating the need for reconciliation of the SSSOM tsv file.",
@@ -501,5 +502,34 @@ def reconcile_prefixes(input: str, reconcile_prefix_file: Path, output: TextIO):
501502
write_table(recon_msdf, output)
502503

503504

505+
@main.command()
506+
@input_argument
507+
@output_option
508+
@click.option(
509+
"-k",
510+
"--by-columns",
511+
default=True,
512+
help="Sort columns of DataFrame canonically.",
513+
)
514+
@click.option(
515+
"-r",
516+
"--by-rows",
517+
default=True,
518+
help="Sort rows by DataFrame column #1 (ascending).",
519+
)
520+
def sort(input: str, output: TextIO, by_columns: bool, by_rows: bool):
521+
"""
522+
Sort DataFrame columns canonically and/or rows by the first column.
523+
524+
:param input: SSSOM TSV file.
525+
:param by_columns: Boolean flag to sort columns canonically.
526+
:param by_rows: Boolean flag to sort rows by column #1 (ascending order).
527+
:param output: SSSOM TSV file with columns sorted.
528+
"""
529+
msdf = read_sssom_table(input)
530+
msdf.df = sort_df_rows_columns(msdf.df, by_columns, by_rows)
531+
write_table(msdf, output)
532+
533+
504534
if __name__ == "__main__":
505535
main()

sssom/util.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,8 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
635635
reconciled_df_subset = reconciled_df_subset.append(
636636
combined_normalized_subset.loc[
637637
match_condition_1[match_condition_1].index, :
638-
]
638+
],
639+
ignore_index=True,
639640
)
640641

641642
# Add negations (PREDICATE_MODIFIER) back to DataFrame
@@ -657,9 +658,14 @@ def deal_with_negation(df: pd.DataFrame) -> pd.DataFrame:
657658
PREDICATE_MODIFIER
658659
].fillna("")
659660

661+
# .fillna(df) towards the end fills an empty value
662+
# with a corresponding value from df.
663+
# This needs to happen because the columns in df
664+
# not in reconciled_df_subset will be NaN otherwise
665+
# which is incorrect.
660666
reconciled_df = df.merge(
661667
reconciled_df_subset, how="right", on=list(reconciled_df_subset.columns)
662-
).fillna("")
668+
).fillna(df)
663669

664670
if nan_df.empty:
665671
return_df = reconciled_df
@@ -769,7 +775,8 @@ def read_pandas(
769775
else:
770776
sep = "\t"
771777
logging.warning("Cannot automatically determine table format, trying tsv.")
772-
return read_csv(file, comment="#", sep=sep).fillna("")
778+
df = read_csv(file, comment="#", sep=sep).fillna("")
779+
return sort_df_rows_columns(df)
773780

774781

775782
def extract_global_metadata(msdoc: MappingSetDocument) -> Dict[str, PrefixMap]:
@@ -820,6 +827,7 @@ def to_mapping_set_dataframe(doc: MappingSetDocument) -> MappingSetDataFrame:
820827
np.nan, "", inplace=True
821828
)
822829
msdf = MappingSetDataFrame(df=df, prefix_map=doc.prefix_map, metadata=meta)
830+
msdf.df = sort_df_rows_columns(msdf.df)
823831
return msdf
824832

825833

@@ -1117,3 +1125,24 @@ def reconcile_prefix_and_data(
11171125

11181126
# TODO: When expansion of 2 prefixes in the prefix_map are the same.
11191127
return msdf
1128+
1129+
1130+
def sort_df_rows_columns(
1131+
df: pd.DataFrame, by_columns: bool = True, by_rows: bool = True
1132+
) -> pd.DataFrame:
1133+
"""
1134+
Canonical sorting of DataFrame columns and/or sorting of rows by the first column.
1135+
1136+
:param df: Pandas DataFrame with random column sequence.
1137+
:param by_columns: Boolean flag to sort columns canonically.
1138+
:param by_rows: Boolean flag to sort rows by column #1 (ascending order).
1139+
:return: Pandas DataFrame with columns sorted canonically and/or rows sorted by column #1.
1140+
"""
1141+
if by_columns:
1142+
column_sequence = [
1143+
col for col in SCHEMA_DICT["slots"].keys() if col in df.columns
1144+
]
1145+
df = df.reindex(column_sequence, axis=1)
1146+
if by_rows:
1147+
df = df.sort_values(by=df.columns[0], ignore_index=True)
1148+
return df

tests/data/basic6.tsv

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#license: "https://creativecommons.org/publicdomain/zero/1.0/"
2+
#mapping_set_id: http://w3id.org/sssom/mapping/tests/data/basic3.tsv
3+
#mapping_tool: "https://github.com/cmungall/rdf_matcher"
4+
#creator_id: "cjm"
5+
#mapping_date: "2020-05-30"
6+
#curie_map:
7+
# a: "http://example.org/a/"
8+
# b: "http://example.org/b/"
9+
# c: "http://example.org/c/"
10+
# d: "http://example.org/d/"
11+
# rdfs: "http://example.org/rdfs/"
12+
# owl: "http://example.org/owl/"
13+
comment mapping_tool subject_label confidence object_id match_type subject_source object_source subject_id subject_match_field object_match_field object_label subject_category predicate_modifier object_category match_string predicate_id
14+
mock data rdf_matcher YYYYY 0.81 b:something Lexical c d c:something rdfs:label rdfs:label yyyyyy biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
15+
mock data rdf_matcher YYYYY 0.82 a:something Lexical d a d:something rdfs:label rdfs:label yyyyyy biolink:AnatomicalEntity Not biolink:AnatomicalEntity xxxxx owl:equivalentClass
16+
mock data rdf_matcher XYXYX 0.83 c:something Lexical a c a:something rdfs:label rdfs:label xyxyxy biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
17+
mock data rdf_matcher YXYXY 0.845 b:something HumanCurated c b c:something rdfs:label rdfs:label yxyxyx biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
18+
mock data rdf_matcher XXXXX 0.8 a:something Lexical b a b:something rdfs:label rdfs:label xxxxxx biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
19+
mock data rdf_matcher YYYYY 0.81 d:something Lexical c d c:something rdfs:label rdfs:label yyyyyy biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
20+
mock data rdf_matcher YXYXY 0.845 b:something HumanCurated c b c:something rdfs:label rdfs:label yxyxyx biolink:AnatomicalEntity Not biolink:AnatomicalEntity xxxxx owl:equivalentClass
21+
mock data rdf_matcher XYXYX 0.83 b:something Lexical d b d:something rdfs:label rdfs:label xyxyxy biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
22+
mock data rdf_matcher XXXXX 0.8 b:something Lexical a b a:something rdfs:label rdfs:label xxxxxx biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:subClassOf
23+
mock data rdf_matcher YYYYY 0.82 a:something HumanCurated d a d:something rdfs:label rdfs:label yyyyyy biolink:AnatomicalEntity biolink:AnatomicalEntity xxxxx owl:equivalentClass
24+
mock data rdf_matcher XXXXX 0.8 b:something Lexical a b a:something rdfs:label rdfs:label xxxxxx biolink:AnatomicalEntity Not biolink:AnatomicalEntity xxxxx owl:subClassOf

tests/test_cli.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
partition,
2020
ptable,
2121
reconcile_prefixes,
22+
sort,
2223
split,
2324
validate,
2425
)
@@ -58,6 +59,7 @@ def test_cli_single_input(self):
5859
self.run_correlations(runner, test)
5960
self.run_reconcile_prefix(runner, test)
6061
self.run_dosql(runner, test)
62+
self.run_sort_rows_columns(runner, test)
6163

6264
self.assertTrue(len(test_cases) > 2)
6365

@@ -258,7 +260,7 @@ def run_dosql(self, runner: CliRunner, test_case: SSSOMTestCase) -> Result:
258260
result = runner.invoke(
259261
dosql,
260262
[
261-
"-q",
263+
"-Q",
262264
"SELECT * FROM df WHERE subject_label = 'heart'",
263265
test_case.filepath,
264266
"-o",
@@ -267,3 +269,24 @@ def run_dosql(self, runner: CliRunner, test_case: SSSOMTestCase) -> Result:
267269
)
268270
self.run_successful(result, test_case)
269271
return result
272+
273+
def run_sort_rows_columns(
274+
self, runner: CliRunner, test_case: SSSOMTestCase
275+
) -> Result:
276+
"""Test sorting of DataFrame columns."""
277+
out_file = os.path.join(test_out_dir, "sort_column_test.tsv")
278+
in_file = test_case.filepath.replace("basic", "basic6")
279+
result = runner.invoke(
280+
sort,
281+
[
282+
in_file,
283+
"-o",
284+
os.path.join(test_out_dir, out_file),
285+
"-k",
286+
True,
287+
"-r",
288+
True,
289+
],
290+
)
291+
self.run_successful(result, test_case)
292+
return result

tests/test_parsers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
read_sssom_table,
2222
to_mapping_set_document,
2323
)
24-
from sssom.util import PREFIX_MAP_KEY, to_mapping_set_dataframe
24+
from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns, to_mapping_set_dataframe
2525
from sssom.writers import write_table
2626
from tests.test_data import data_dir as test_data_dir
2727
from tests.test_data import test_out_dir
@@ -190,6 +190,7 @@ def test_read_sssom_table(self):
190190
input_path = os.path.join(test_data_dir, "basic3.tsv")
191191
msdf = read_sssom_table(input_path)
192192
imported_df = pd.read_csv(input_path, comment="#", sep="\t")
193+
imported_df = sort_df_rows_columns(imported_df)
193194
self.assertEqual(set(imported_df.columns), set(msdf.df.columns))
194195
list_cols = [
195196
"subject_match_field",

tests/test_reconcile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_filter(self):
2323
def test_deal_with_negation(self):
2424
"""Test handling negating returns the right number of rows."""
2525
df = deal_with_negation(self.msdf.df)
26-
self.assertEqual(7, len(df.index))
26+
self.assertEqual(8, len(df.index))
2727

2828
def test_merge(self):
2929
"""Test merging two tables."""

tests/test_sort.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""Test for sorting MappingSetDataFrame columns."""
2+
3+
import unittest
4+
5+
from sssom.constants import SCHEMA_DICT
6+
from sssom.parsers import read_sssom_table
7+
from sssom.util import sort_df_rows_columns
8+
from tests.constants import data_dir
9+
10+
11+
class TestSort(unittest.TestCase):
12+
"""A test case for sorting msdf columns."""
13+
14+
def setUp(self) -> None:
15+
"""Set up the test case with the sixth basic example."""
16+
self.msdf = read_sssom_table(f"{data_dir}/basic6.tsv")
17+
18+
def test_sort(self):
19+
"""Test sorting of columns."""
20+
new_df = sort_df_rows_columns(self.msdf.df)
21+
column_sequence = [
22+
col for col in SCHEMA_DICT["slots"].keys() if col in new_df.columns
23+
]
24+
self.assertListEqual(column_sequence, list(new_df.columns))

0 commit comments

Comments
 (0)