Skip to content

Commit a66e80b

Browse files
committed
09_git.ipynb: Split up _repo_name_to_out_basename()
Part 1 of the refactoring of filename generation for storing the graph structure of a commit graph (of a repository) or of any directed graph, and for storing per-node data like reachability labels. The idea is to have common code for this part of generating the commit graph and for storing DataFrames with the data about the graph. The _repo_name_to_out_basename() function got split into separate parts that are responsible for separate steps of the transformation. The final function, _savefile_name(), generates the whole name, not just the prefix / base of it. The transformation now goes like this: repo path -> repo name -> commit graph name -> save file name. Note, however, that the newly created _repo_graph_savefile() is not used yet in the commit_graph() function.
1 parent 8218c31 commit a66e80b

File tree

2 files changed

+390
-63
lines changed

2 files changed

+390
-63
lines changed

09_git.ipynb

Lines changed: 233 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,7 @@
1313
"cell_type": "code",
1414
"execution_count": null,
1515
"metadata": {},
16-
"outputs": [
17-
{
18-
"name": "stdout",
19-
"output_type": "stream",
20-
"text": [
21-
"The autoreload extension is already loaded. To reload it, use:\n",
22-
" %reload_ext autoreload\n"
23-
]
24-
}
25-
],
16+
"outputs": [],
2617
"source": [
2718
"#hide_output\n",
2819
"%load_ext autoreload\n",
@@ -983,56 +974,234 @@
983974
"outputs": [],
984975
"source": [
985976
"#export\n",
986-
"def _repo_name_to_out_basename(repo_name, out_dir=\"datasets\"):\n",
987-
" \"\"\"Create basename for file that would store graph's `DataFrame`s\n",
977+
"def _repo_basename(repo_path):\n",
978+
" \"\"\"Create name of repository out of its pathname\n",
988979
"\n",
989980
" This is a helper function used, among others, in ...\n",
990981
"\n",
991982
" Examples:\n",
992983
" ---------\n",
993-
" >>>> _repo_name_to_out_basename('hellogitworld')\n",
994-
" Path('datasets/hellogitworld')\n",
995-
" >>>> _repo_name_to_out_basename('hellogitworld.git', out_dir='data')\n",
996-
" Path('data/hellogitworld')\n",
984+
" >>>> _repo_basename('hellogitworld')\n",
985+
" 'hellogitworld'\n",
986+
" >>>> _repo_basename('repos/hellogitworld.git')\n",
987+
" 'hellogitworld'\n",
988+
"\n",
989+
" Parameters\n",
990+
" ----------\n",
991+
" repo_path : str\n",
992+
" Path to the Git repository, for example 'hellogitworld',\n",
993+
" or 'hellogitworld.git', or 'repos/hellogitworld.git', or 'repos/hellogitworld/.git'\n",
994+
" for Git repository cloned from <https://github.com/githubtraining/hellogitworld>\n",
995+
" \n",
996+
" Returns\n",
997+
" -------\n",
998+
" str\n",
999+
" The name of repository, to be later used as a base for commit graph name,\n",
1000+
" and the pathname of a file where to save information about thw history\n",
1001+
" structure of the repository.\n",
1002+
" \"\"\"\n",
1003+
" # generate the name of the output file, as `pathlib.Path` object\n",
1004+
" # removing the '*.git' extension / suffix from `repo_path`, if needed\n",
1005+
" out_pathname = Path(repo_name).stem\n",
1006+
"\n",
1007+
" # convert it to string\n",
1008+
" return str(out_pathname)\n",
1009+
"\n",
1010+
"\n",
1011+
"def _commit_graph_name(repo_name):\n",
1012+
" \"\"\"Create the name of a commit graph out of repository name.\n",
1013+
" \n",
1014+
" This is a helper function used, among others, in ...\n",
9971015
"\n",
1016+
" Examples:\n",
1017+
" ---------\n",
1018+
" >>>> _commit_graph_name('hellogitworld')\n",
1019+
" 'hellogitworld-commit_graph'\n",
1020+
" \n",
9981021
" Parameters\n",
9991022
" ----------\n",
10001023
" repo_name : str\n",
1001-
" Name of the Git repository (`<graph>.name` can be used), for example\n",
1002-
" 'hellogitworld' or 'hellogitworld.git' for Git repository cloned from\n",
1003-
" <https://github.com/githubtraining/hellogitworld>\n",
1024+
" The name of the repository, for example the result of calling the\n",
1025+
" `_repo_basename()` function.\n",
1026+
" \n",
1027+
" Returns\n",
1028+
" -------\n",
1029+
" str\n",
1030+
" The name of the commit graph of the repository, to be used as a base\n",
1031+
" for the pathname of a file where to save information about the history\n",
1032+
" structure of the repository.\n",
1033+
" \"\"\"\n",
1034+
" return repo_name + '-commit_graph'\n",
10041035
"\n",
1036+
"\n",
def _repo_graph_name(repo_path):
    """Derive the name of a commit graph from the path to its repository

    Chains `_repo_basename()` (path -> repository name) and
    `_commit_graph_name()` (repository name -> commit graph name).

    Examples:
    ---------
    >>> _repo_graph_name('repos/hellogitworld.git')
    'hellogitworld-commit_graph'

    Parameters
    ----------
    repo_path : str
        Path to the Git repository (`<graph>.name` can be used), for example
        'hellogitworld', or 'hellogitworld.git', or 'repos/hellogitworld.git',
        or 'repos/hellogitworld/.git' for Git repository cloned from
        <https://github.com/githubtraining/hellogitworld>

    Returns
    -------
    str
        The name of the commit graph of the repository; it serves as the
        base of the pathname of the file where the history structure of
        the repository gets saved.
    """
    repo_name = _repo_basename(repo_path)
    return _commit_graph_name(repo_name)
1063+
"\n"
1064+
]
1065+
},
1066+
{
1067+
"cell_type": "markdown",
1068+
"metadata": {},
1069+
"source": [
1070+
"<u>Test</u> that examples from the docstring work:"
1071+
]
1072+
},
1073+
{
1074+
"cell_type": "code",
1075+
"execution_count": null,
1076+
"metadata": {},
1077+
"outputs": [],
1078+
"source": [
1079+
"assert _repo_basename('hellogitworld') == 'hellogitworld'\n",
1080+
"assert _repo_basename('hellogitworld.git') == 'hellogitworld'\n",
1081+
"assert _repo_basename('repos/hellogitworld.git') == 'hellogitworld'\n",
1082+
"assert _repo_basename('repos/hellogitworld/.git') == 'hellogitworld'"
1083+
]
1084+
},
1085+
{
1086+
"cell_type": "code",
1087+
"execution_count": null,
1088+
"metadata": {},
1089+
"outputs": [],
1090+
"source": [
1091+
"assert _commit_graph_name('hellogitworld') == 'hellogitworld-commit_graph'"
1092+
]
1093+
},
1094+
{
1095+
"cell_type": "code",
1096+
"execution_count": null,
1097+
"metadata": {},
1098+
"outputs": [],
1099+
"source": [
1100+
"assert _repo_graph_name('repos/hellogitworld.git') == 'hellogitworld-commit_graph'"
1101+
]
1102+
},
1103+
{
1104+
"cell_type": "code",
1105+
"execution_count": null,
1106+
"metadata": {},
1107+
"outputs": [],
1108+
"source": [
1109+
"#export\n",
1110+
"def _savefile_name(graph_name, out_dir='datasets', kind='df_edgelist', file_format='csv.gz'):\n",
1111+
" \"\"\"Create filename for storing graph structure and other graph data\n",
1112+
" \n",
1113+
" This is a helper function used, among others, in ...\n",
1114+
" \n",
1115+
" Examples:\n",
1116+
" ---------\n",
1117+
" >>>> _savefile_name('example_graph')\n",
1118+
" Path('datasets/example_graph.df_edgelist.csv.gz')\n",
1119+
" \n",
1120+
" Parameters\n",
1121+
" ----------\n",
1122+
" graph_name : str\n",
1123+
" Name of the graph (`<graph>.name` can be used).\n",
1124+
" \n",
10051125
" out_dir : str\n",
10061126
" Directory where saved commit graph data would be stored.\n",
10071127
" Defaults to \"datasets\".\n",
1008-
"\n",
1128+
" \n",
1129+
" kind : str\n",
1130+
" What type of data is stored in a file, and in what representation.\n",
1131+
" The default value is 'df_edgelist', used to store graph structure in\n",
1132+
" the edge list format in a `pandas.DataFrame`.\n",
1133+
" \n",
1134+
" file_format : str\n",
1135+
" Format of a file, for example how the `DataFrame` is saved.\n",
1136+
" Defaults to 'csv.gz' (gzip-compressed Comma Separated Values).\n",
1137+
" \n",
10091138
" Returns\n",
10101139
" -------\n",
10111140
" Path\n",
1012-
" Basename of path to store graph structure or graph data. To be\n",
1013-
" concatenated with data-describing suffix and a proper extension.\n",
1141+
" Path to the file storing the graph structure or graph data in\n",
1142+
" the appropriate representation and appropriate file format.\n",
10141143
" \"\"\"\n",
10151144
" # The `out_dir` should not be None\n",
10161145
" if out_dir is None:\n",
10171146
" out_dir = \".\"\n",
1147+
"\n",
1148+
" # compose the basename of the pathname\n",
1149+
" filename = graph_name\n",
1150+
" # TODO: there would special case for saving to HDF5 files, which can\n",
1151+
" # store multiple data in a single file, so there would be no need\n",
1152+
" # to add <kind> to basename of such output file\n",
1153+
" if kind is not None and kind != '':\n",
1154+
" filename += '.' + kind\n",
1155+
" if file_format is not None and file_format != '':\n",
1156+
" filename += '.' + file_format\n",
10181157
" # generate the name of the output file, as `pathlib.Path` object\n",
1019-
" # removing the '*.git' extension / suffix from `repo_path`, if needed\n",
1020-
" out_pathname = Path(out_dir) / Path(repo_name).stem\n",
1158+
" return Path(out_dir) / filename\n",
10211159
"\n",
1022-
" return out_pathname\n",
10231160
"\n",
def _out_basename(graph_name, out_dir='datasets'):
    """Create the path prefix, without kind and file format, for graph data files

    Calls `_savefile_name()` with both `kind` and `file_format` set to `None`,
    so that neither the '.<kind>' nor the '.<file_format>' part is appended.

    Parameters
    ----------
    graph_name : str
        Name of the graph.

    out_dir : str
        Directory where the data would be stored.  Defaults to "datasets".

    Returns
    -------
    Path
        The result of `_savefile_name()` for the given graph name and
        output directory, with no suffixes added.
    """
    no_suffixes = dict(kind=None, file_format=None)
    return _savefile_name(graph_name, out_dir=out_dir, **no_suffixes)
10241163
"\n",
1025-
"def _repo_name_to_edgelist_basename(repo_name, out_dir=\"datasets\"):\n",
1026-
" return str(_repo_name_to_out_basename(repo_name, out_dir=out_dir)) + \\\n",
1027-
" '-commit_graph'+'.df_edgelist'\n",
1028-
"\n"
1164+
"\n",
def _repo_graph_savefile(repo_path, out_dir='datasets'):
    """Compose the path of the adjacency list file for a repository's commit graph

    The repository path is first turned into the commit graph name with
    `_repo_graph_name()`, and then into the file path with `_savefile_name()`,
    using 'adjlist' as the kind of data and 'txt' as the file format.

    Examples:
    ---------
    >>> _repo_graph_savefile('repos/hellogitworld.git') == Path('datasets/hellogitworld-commit_graph.adjlist.txt')
    True
    >>> _repo_graph_savefile('repos/hellogitworld.git', out_dir='data') == Path('data/hellogitworld-commit_graph.adjlist.txt')
    True

    Parameters
    ----------
    repo_path : str
        Path to the Git repository, for example 'hellogitworld',
        or 'hellogitworld.git', or 'repos/hellogitworld.git', or 'repos/hellogitworld/.git'
        for Git repository cloned from <https://github.com/githubtraining/hellogitworld>

    out_dir : str
        Directory where extracted commit graph data would be stored.
        Defaults to "datasets".

    Returns
    -------
    Path
        Path to the file storing the commit graph in the adjacency list
        file format.

        see: https://networkx.org/documentation/stable/reference/readwrite/adjlist.html
    """
    return _savefile_name(
        _repo_graph_name(repo_path),
        out_dir=out_dir,
        kind='adjlist',
        file_format='txt',
    )
10291198
]
10301199
},
10311200
{
10321201
"cell_type": "markdown",
10331202
"metadata": {},
10341203
"source": [
1035-
"<u>Test</u> that example from the docstring works:"
1204+
"<u>Test</u> that examples from the docstring work:"
10361205
]
10371206
},
10381207
{
@@ -1041,8 +1210,9 @@
10411210
"metadata": {},
10421211
"outputs": [],
10431212
"source": [
1044-
"assert _repo_name_to_out_basename('hellogitworld') == Path('datasets/hellogitworld')\n",
1045-
"assert _repo_name_to_out_basename('hellogitworld.git', out_dir='data') == Path('data/hellogitworld')"
1213+
"assert _savefile_name('example_graph') == Path('datasets/example_graph.df_edgelist.csv.gz')\n",
1214+
"assert _repo_graph_savefile('repos/hellogitworld.git') == Path('datasets/hellogitworld-commit_graph.adjlist.txt')\n",
1215+
"assert _repo_graph_savefile('repos/hellogitworld.git', out_dir='data') == Path('data/hellogitworld-commit_graph.adjlist.txt')"
10461216
]
10471217
},
10481218
{
@@ -1121,8 +1291,9 @@
11211291
"source": [
11221292
"#export\n",
11231293
"def save_graph_df(df, graph_name, datasets_dir='datasets', output_format='csv.gz', overwrite=False):\n",
1124-
" filename = _repo_name_to_edgelist_basename(graph_name, out_dir=datasets_dir) + \\\n",
1125-
" '.'+output_format\n",
1294+
" filename = _savefile_name(graph_name, out_dir=datasets_dir,\n",
1295+
" kind='df_edgelist', file_format=output_format)\n",
1296+
" print('-> filename:', filename)\n",
11261297
" if not overwrite and Path(filename).is_file():\n",
11271298
" return\n",
11281299
" if output_format == 'csv' or output_format == 'csv.gz':\n",
@@ -1141,7 +1312,7 @@
11411312
" else:\n",
11421313
" raise RuntimeError(\"Neither 'graph_name' parameter given, nor 'graph' has 'name' attribute\")\n",
11431314
"\n",
1144-
" print('graph_name: ', graph_name)\n",
1315+
" print('-> graph_name:', graph_name)\n",
11451316
" save_graph_df(df, graph_name,\n",
11461317
" datasets_dir=datasets_dir, output_format=output_format, overwrite=overwrite)\n",
11471318
"\n",
@@ -1154,8 +1325,9 @@
11541325
"\n",
11551326
"\n",
def load_graph_df(graph_name, datasets_dir='datasets', input_format='csv.gz'):
    """Load the edge list data of the named graph from the datasets directory

    The file to read is located with `_savefile_name()`, using 'df_edgelist'
    as the kind of data; its path is printed before reading.

    Parameters
    ----------
    graph_name : str
        Name of the graph; it is used as the base of the file name.

    datasets_dir : str
        Directory to read the file from.  Defaults to "datasets".

    input_format : str
        Format of the file, used both as the file name extension and
        passed on to `load_graph_df_from_file()`.  Defaults to 'csv.gz'.

    Returns
    -------
    The result of `load_graph_df_from_file()`
        Presumably the `DataFrame` with the edge list of the graph;
        verify against `load_graph_df_from_file()`.
    """
    df_path = _savefile_name(graph_name, kind='df_edgelist',
                             file_format=input_format, out_dir=datasets_dir)
    # report which file is being read
    print('<- filename:', df_path)
    return load_graph_df_from_file(df_path, input_format=input_format)
11601332
"\n"
11611333
]
@@ -1176,15 +1348,24 @@
11761348
"name": "stdout",
11771349
"output_type": "stream",
11781350
"text": [
1351+
"repository path: repos/hellogitworld.git\n",
11791352
"saving commit graph of: hellogitworld.git\n",
1180-
"graph_name: hellogitworld.git\n",
1353+
"graph.name = hellogitworld.git\n",
1354+
"commit graph name: hellogitworld-commit_graph\n",
1355+
"testing save_graph()\n",
1356+
"-> graph_name: hellogitworld.git\n",
1357+
"-> filename: datasets\\hellogitworld.git.df_edgelist.csv.gz\n",
1358+
"testing save_graph_df()\n",
1359+
"-> filename: datasets\\hellogitworld-commit_graph.df_edgelist.csv.gz\n",
11811360
"there should be appropriately named file in the list below:\n"
11821361
]
11831362
},
11841363
{
11851364
"data": {
11861365
"text/plain": [
1187-
"['hellogitworld-commit_graph.df_edgelist.csv.gz 523']"
1366+
"['hellogitworld-commit_graph.adjlist.txt 868',\n",
1367+
" 'hellogitworld-commit_graph.df_edgelist.csv.gz 523',\n",
1368+
" 'hellogitworld.git.df_edgelist.csv.gz 514']"
11881369
]
11891370
},
11901371
"execution_count": null,
@@ -1193,9 +1374,14 @@
11931374
}
11941375
],
11951376
"source": [
1377+
"print('repository path: {}'.format(repo_path))\n",
11961378
"print('saving commit graph of: {}'.format(repo_name))\n",
1379+
"print('graph.name = {}'.format(repo_graph.name))\n",
1380+
"print('commit graph name: {}'.format(_repo_graph_name(repo_path)))\n",
1381+
"print('testing save_graph()')\n",
11971382
"save_graph(repo_graph)\n",
1198-
"save_graph_df(repo_graph_df, graph_name=repo_name)\n",
1383+
"print('testing save_graph_df()')\n",
1384+
"save_graph_df(repo_graph_df, graph_name=_repo_graph_name(repo_path))\n",
11991385
"print('there should be appropriately named file in the list below:')\n",
12001386
"[\"{name:<50} {size:>7}\".format(name=p.name,size=p.stat().st_size) for p in Path(\"datasets\").glob(\"hellogitworld*\")]"
12011387
]
@@ -1210,6 +1396,7 @@
12101396
"output_type": "stream",
12111397
"text": [
12121398
"restoring commit graph of: hellogitworld.git\n",
1399+
"<- filename: datasets\\hellogitworld.git.df_edgelist.csv.gz\n",
12131400
"ok - dataset and restored dataset are equal\n"
12141401
]
12151402
}
@@ -1265,7 +1452,9 @@
12651452
"- graph\n",
12661453
"- lvl\n",
12671454
"- mpi_ext\n",
1268-
"reachability labels should be computed for all nodes\n"
1455+
"ok - graph has both \"lvl\" and \"mpi_ext\" attributes\n",
1456+
"reachability labels should be computed for all nodes\n",
1457+
"ok - both lvl and mpi_ext keys are all 55 graph nodes\n"
12691458
]
12701459
}
12711460
],
@@ -1279,10 +1468,12 @@
12791468
" print('- {:s}'.format(attr))\n",
12801469
"assert hasattr(repo_graph, 'lvl')\n",
12811470
"assert hasattr(repo_graph, 'mpi_ext')\n",
1471+
"print('ok - graph has both \"{}\" and \"{}\" attributes'.format('lvl', 'mpi_ext'))\n",
12821472
" \n",
12831473
"print('reachability labels should be computed for all nodes')\n",
12841474
"assert set(repo_graph.lvl.keys()) == set(repo_graph.nodes)\n",
1285-
"assert set(repo_graph.mpi_ext.keys()) == set(repo_graph.nodes)"
1475+
"assert set(repo_graph.mpi_ext.keys()) == set(repo_graph.nodes)\n",
1476+
"print('ok - both lvl and mpi_ext keys are all {} graph nodes'.format(len(repo_graph.nodes)))"
12861477
]
12871478
},
12881479
{

0 commit comments

Comments
 (0)