|
13 | 13 | "cell_type": "code",
|
14 | 14 | "execution_count": null,
|
15 | 15 | "metadata": {},
|
16 |
| - "outputs": [ |
17 |
| - { |
18 |
| - "name": "stdout", |
19 |
| - "output_type": "stream", |
20 |
| - "text": [ |
21 |
| - "The autoreload extension is already loaded. To reload it, use:\n", |
22 |
| - " %reload_ext autoreload\n" |
23 |
| - ] |
24 |
| - } |
25 |
| - ], |
| 16 | + "outputs": [], |
26 | 17 | "source": [
|
27 | 18 | "#hide_output\n",
|
28 | 19 | "%load_ext autoreload\n",
|
|
983 | 974 | "outputs": [],
|
984 | 975 | "source": [
|
985 | 976 | "#export\n",
|
986 |
| - "def _repo_name_to_out_basename(repo_name, out_dir=\"datasets\"):\n", |
987 |
| - " \"\"\"Create basename for file that would store graph's `DataFrame`s\n", |
| 977 | + "def _repo_basename(repo_path):\n", |
| 978 | + " \"\"\"Create name of repository out of its pathname\n", |
988 | 979 | "\n",
|
989 | 980 | " This is a helper function used, among others, in ...\n",
|
990 | 981 | "\n",
|
991 | 982 | " Examples:\n",
|
992 | 983 | " ---------\n",
|
993 |
| - " >>>> _repo_name_to_out_basename('hellogitworld')\n", |
994 |
| - " Path('datasets/hellogitworld')\n", |
995 |
| - " >>>> _repo_name_to_out_basename('hellogitworld.git', out_dir='data')\n", |
996 |
| - " Path('data/hellogitworld')\n", |
| 984 | + " >>>> _repo_basename('hellogitworld')\n", |
| 985 | + " 'hellogitworld'\n", |
| 986 | + " >>>> _repo_basename('repos/hellogitworld.git')\n", |
| 987 | + " 'hellogitworld'\n", |
| 988 | + "\n", |
| 989 | + " Parameters\n", |
| 990 | + " ----------\n", |
| 991 | + " repo_path : str\n", |
| 992 | + " Path to the Git repository, for example 'hellogitworld',\n", |
| 993 | + " or 'hellogitworld.git', or 'repos/hellogitworld.git', or 'repos/hellogitworld/.git'\n", |
| 994 | + " for Git repository cloned from <https://github.com/githubtraining/hellogitworld>\n", |
| 995 | + " \n", |
| 996 | + " Returns\n", |
| 997 | + " -------\n", |
| 998 | + " str\n", |
| 999 | + " The name of repository, to be later used as a base for commit graph name,\n", |
| 1000 | + " and the pathname of a file where to save information about the history\n", |
| 1001 | + " structure of the repository.\n", |
| 1002 | + " \"\"\"\n", |
| 1003 | + " # generate the name of the output file, as `pathlib.Path` object\n", |
| 1004 | + " # removing the '*.git' extension / suffix from `repo_path`, if needed\n", |
| 1005 | + " out_pathname = Path(repo_name).stem\n", |
| 1006 | + "\n", |
| 1007 | + " # convert it to string\n", |
| 1008 | + " return str(out_pathname)\n", |
| 1009 | + "\n", |
| 1010 | + "\n", |
| 1011 | + "def _commit_graph_name(repo_name):\n", |
| 1012 | + " \"\"\"Create the name of a commit graph out of repository name.\n", |
| 1013 | + " \n", |
| 1014 | + " This is a helper function used, among others, in ...\n", |
997 | 1015 | "\n",
|
| 1016 | + " Examples:\n", |
| 1017 | + " ---------\n", |
| 1018 | + " >>>> _commit_graph_name('hellogitworld')\n", |
| 1019 | + " 'hellogitworld-commit_graph'\n", |
| 1020 | + " \n", |
998 | 1021 | " Parameters\n",
|
999 | 1022 | " ----------\n",
|
1000 | 1023 | " repo_name : str\n",
|
1001 |
| - " Name of the Git repository (`<graph>.name` can be used), for example\n", |
1002 |
| - " 'hellogitworld' or 'hellogitworld.git' for Git repository cloned from\n", |
1003 |
| - " <https://github.com/githubtraining/hellogitworld>\n", |
| 1024 | + " The name of the repository, for example the result of calling the\n", |
| 1025 | + " `_repo_basename()` function.\n", |
| 1026 | + " \n", |
| 1027 | + " Returns\n", |
| 1028 | + " -------\n", |
| 1029 | + " str\n", |
| 1030 | + " The name of the commit graph of the repository, to be used as a base\n", |
| 1031 | + " for the pathname of a file where to save information about the history\n", |
| 1032 | + " structure of the repository.\n", |
| 1033 | + " \"\"\"\n", |
| 1034 | + " return repo_name + '-commit_graph'\n", |
1004 | 1035 | "\n",
|
| 1036 | + "\n", |
| 1037 | + "def _repo_graph_name(repo_path):\n", |
| 1038 | + " \"\"\"Create the name of a commit graph out of repository path (its pathname)\n", |
| 1039 | + " \n", |
| 1040 | + " This is a helper function used, among others, in ...\n", |
| 1041 | + "\n", |
| 1042 | + " Examples:\n", |
| 1043 | + " ---------\n", |
| 1044 | + " >>>> _repo_graph_name('repos/hellogitworld.git')\n", |
| 1045 | + " 'hellogitworld-commit_graph'\n", |
| 1046 | + " \n", |
| 1047 | + " Parameters\n", |
| 1048 | + " ----------\n", |
| 1049 | + " repo_path : str\n", |
| 1050 | + " Path to the Git repository (`<graph>.name` can be used), for example\n", |
| 1051 | + " 'hellogitworld', or 'hellogitworld.git', or 'repos/hellogitworld.git',\n", |
| 1052 | + " or 'repos/hellogitworld/.git' for Git repository cloned from\n", |
| 1053 | + " <https://github.com/githubtraining/hellogitworld>\n", |
| 1054 | + " \n", |
| 1055 | + " Returns\n", |
| 1056 | + " -------\n", |
| 1057 | + " str\n", |
| 1058 | + " The name of the commit graph of the repository, to be used as a base\n", |
| 1059 | + " for the pathname of a file where to save information about the history\n", |
| 1060 | + " structure of the repository.\n", |
| 1061 | + " \"\"\"\n", |
| 1062 | + " return _commit_graph_name(_repo_basename(repo_path))\n", |
| 1063 | + "\n" |
| 1064 | + ] |
| 1065 | + }, |
| 1066 | + { |
| 1067 | + "cell_type": "markdown", |
| 1068 | + "metadata": {}, |
| 1069 | + "source": [ |
| 1070 | + "<u>Test</u> that examples from the docstring work:" |
| 1071 | + ] |
| 1072 | + }, |
| 1073 | + { |
| 1074 | + "cell_type": "code", |
| 1075 | + "execution_count": null, |
| 1076 | + "metadata": {}, |
| 1077 | + "outputs": [], |
| 1078 | + "source": [ |
| 1079 | + "assert _repo_basename('hellogitworld') == 'hellogitworld'\n", |
| 1080 | + "assert _repo_basename('hellogitworld.git') == 'hellogitworld'\n", |
| 1081 | + "assert _repo_basename('repos/hellogitworld.git') == 'hellogitworld'\n", |
| 1082 | + "assert _repo_basename('repos/hellogitworld/.git') == 'hellogitworld'" |
| 1083 | + ] |
| 1084 | + }, |
| 1085 | + { |
| 1086 | + "cell_type": "code", |
| 1087 | + "execution_count": null, |
| 1088 | + "metadata": {}, |
| 1089 | + "outputs": [], |
| 1090 | + "source": [ |
| 1091 | + "assert _commit_graph_name('hellogitworld') == 'hellogitworld-commit_graph'" |
| 1092 | + ] |
| 1093 | + }, |
| 1094 | + { |
| 1095 | + "cell_type": "code", |
| 1096 | + "execution_count": null, |
| 1097 | + "metadata": {}, |
| 1098 | + "outputs": [], |
| 1099 | + "source": [ |
| 1100 | + "assert _repo_graph_name('repos/hellogitworld.git') == 'hellogitworld-commit_graph'" |
| 1101 | + ] |
| 1102 | + }, |
| 1103 | + { |
| 1104 | + "cell_type": "code", |
| 1105 | + "execution_count": null, |
| 1106 | + "metadata": {}, |
| 1107 | + "outputs": [], |
| 1108 | + "source": [ |
| 1109 | + "#export\n", |
| 1110 | + "def _savefile_name(graph_name, out_dir='datasets', kind='df_edgelist', file_format='csv.gz'):\n", |
| 1111 | + " \"\"\"Create filename for storing graph structure and other graph data\n", |
| 1112 | + " \n", |
| 1113 | + " This is a helper function used, among others, in ...\n", |
| 1114 | + " \n", |
| 1115 | + " Examples:\n", |
| 1116 | + " ---------\n", |
| 1117 | + " >>>> _savefile_name('example_graph')\n", |
| 1118 | + " Path('datasets/example_graph.df_edgelist.csv.gz')\n", |
| 1119 | + " \n", |
| 1120 | + " Parameters\n", |
| 1121 | + " ----------\n", |
| 1122 | + " graph_name : str\n", |
| 1123 | + " Name of the graph (`<graph>.name` can be used).\n", |
| 1124 | + " \n", |
1005 | 1125 | " out_dir : str\n",
|
1006 | 1126 | " Directory where saved commit graph data would be stored.\n",
|
1007 | 1127 | " Defaults to \"datasets\".\n",
|
1008 |
| - "\n", |
| 1128 | + " \n", |
| 1129 | + " kind : str\n", |
| 1130 | + " What type of data is stored in a file, and in what representation.\n", |
| 1131 | + " The default value is 'df_edgelist', used to store graph structure in\n", |
| 1132 | + " the edge list format in a `pandas.DataFrame`.\n", |
| 1133 | + " \n", |
| 1134 | + " file_format : str\n", |
| 1135 | + " Format of a file, for example how the `DataFrame` is saved.\n", |
| 1136 | + " Defaults to 'csv.gz' (gzip-compressed Comma Separated Values).\n", |
| 1137 | + " \n", |
1009 | 1138 | " Returns\n",
|
1010 | 1139 | " -------\n",
|
1011 | 1140 | " Path\n",
|
1012 |
| - " Basename of path to store graph structure or graph data. To be\n", |
1013 |
| - " concatenated with data-describing suffix and a proper extension.\n", |
| 1141 | + " Path to the file storing the graph structure or graph data in\n", |
| 1142 | + " the appropriate representation and appropriate file format.\n", |
1014 | 1143 | " \"\"\"\n",
|
1015 | 1144 | " # The `out_dir` should not be None\n",
|
1016 | 1145 | " if out_dir is None:\n",
|
1017 | 1146 | " out_dir = \".\"\n",
|
| 1147 | + "\n", |
| 1148 | + " # compose the basename of the pathname\n", |
| 1149 | + " filename = graph_name\n", |
| 1150 | + " # TODO: there would be a special case for saving to HDF5 files, which can\n", |
| 1151 | + " # store multiple data in a single file, so there would be no need\n", |
| 1152 | + " # to add <kind> to basename of such output file\n", |
| 1153 | + " if kind is not None and kind != '':\n", |
| 1154 | + " filename += '.' + kind\n", |
| 1155 | + " if file_format is not None and file_format != '':\n", |
| 1156 | + " filename += '.' + file_format\n", |
1018 | 1157 | " # generate the name of the output file, as `pathlib.Path` object\n",
|
1019 |
| - " # removing the '*.git' extension / suffix from `repo_path`, if needed\n", |
1020 |
| - " out_pathname = Path(out_dir) / Path(repo_name).stem\n", |
| 1158 | + " return Path(out_dir) / filename\n", |
1021 | 1159 | "\n",
|
1022 |
| - " return out_pathname\n", |
1023 | 1160 | "\n",
|
| 1161 | + "def _out_basename(graph_name, out_dir='datasets'):\n", |
| 1162 | + " return _savefile_name(graph_name, out_dir=out_dir, kind=None, file_format=None)\n", |
1024 | 1163 | "\n",
|
1025 |
| - "def _repo_name_to_edgelist_basename(repo_name, out_dir=\"datasets\"):\n", |
1026 |
| - " return str(_repo_name_to_out_basename(repo_name, out_dir=out_dir)) + \\\n", |
1027 |
| - " '-commit_graph'+'.df_edgelist'\n", |
1028 |
| - "\n" |
| 1164 | + "\n", |
| 1165 | + "def _repo_graph_savefile(repo_path, out_dir='datasets'):\n", |
| 1166 | + " \"\"\"Create filename for storing adjacency list out of repository path\n", |
| 1167 | + "\n", |
| 1168 | + " This is a helper function used, among others, in ...\n", |
| 1169 | + "\n", |
| 1170 | + " Examples:\n", |
| 1171 | + " ---------\n", |
| 1172 | + " >>>> _repo_graph_savefile('repos/hellogitworld.git')\n", |
| 1173 | + " Path('datasets/hellogitworld-commit_graph.adjlist.txt')\n", |
| 1174 | + " >>>> _repo_graph_savefile('repos/hellogitworld.git', out_dir='data')\n", |
| 1175 | + " Path('data/hellogitworld-commit_graph.adjlist.txt')\n", |
| 1176 | + "\n", |
| 1177 | + " Parameters\n", |
| 1178 | + " ----------\n", |
| 1179 | + " repo_path : str\n", |
| 1180 | + " Path to the Git repository, for example 'hellogitworld',\n", |
| 1181 | + " or 'hellogitworld.git', or 'repos/hellogitworld.git', or 'repos/hellogitworld/.git'\n", |
| 1182 | + " for Git repository cloned from <https://github.com/githubtraining/hellogitworld>\n", |
| 1183 | + "\n", |
| 1184 | + " out_dir : str\n", |
| 1185 | + " Directory where extracted commit graph data would be stored.\n", |
| 1186 | + " Defaults to \"datasets\".\n", |
| 1187 | + "\n", |
| 1188 | + " Returns\n", |
| 1189 | + " -------\n", |
| 1190 | + " Path\n", |
| 1191 | + " Path to the file storing the commit graph in the adjacency list\n", |
| 1192 | + " file format.\n", |
| 1193 | + "\n", |
| 1194 | + " see: https://networkx.org/documentation/stable/reference/readwrite/adjlist.html\n", |
| 1195 | + " \"\"\"\n", |
| 1196 | + " graph_name = _repo_graph_name(repo_path)\n", |
| 1197 | + " return _savefile_name(graph_name, out_dir=out_dir, kind='adjlist', file_format='txt')" |
1029 | 1198 | ]
|
1030 | 1199 | },
|
1031 | 1200 | {
|
1032 | 1201 | "cell_type": "markdown",
|
1033 | 1202 | "metadata": {},
|
1034 | 1203 | "source": [
|
1035 |
| - "<u>Test</u> that example from the docstring works:" |
| 1204 | + "<u>Test</u> that examples from the docstring work:" |
1036 | 1205 | ]
|
1037 | 1206 | },
|
1038 | 1207 | {
|
|
1041 | 1210 | "metadata": {},
|
1042 | 1211 | "outputs": [],
|
1043 | 1212 | "source": [
|
1044 |
| - "assert _repo_name_to_out_basename('hellogitworld') == Path('datasets/hellogitworld')\n", |
1045 |
| - "assert _repo_name_to_out_basename('hellogitworld.git', out_dir='data') == Path('data/hellogitworld')" |
| 1213 | + "assert _savefile_name('example_graph') == Path('datasets/example_graph.df_edgelist.csv.gz')\n", |
| 1214 | + "assert _repo_graph_savefile('repos/hellogitworld.git') == Path('datasets/hellogitworld-commit_graph.adjlist.txt')\n", |
| 1215 | + "assert _repo_graph_savefile('repos/hellogitworld.git', out_dir='data') == Path('data/hellogitworld-commit_graph.adjlist.txt')" |
1046 | 1216 | ]
|
1047 | 1217 | },
|
1048 | 1218 | {
|
|
1121 | 1291 | "source": [
|
1122 | 1292 | "#export\n",
|
1123 | 1293 | "def save_graph_df(df, graph_name, datasets_dir='datasets', output_format='csv.gz', overwrite=False):\n",
|
1124 |
| - " filename = _repo_name_to_edgelist_basename(graph_name, out_dir=datasets_dir) + \\\n", |
1125 |
| - " '.'+output_format\n", |
| 1294 | + " filename = _savefile_name(graph_name, out_dir=datasets_dir,\n", |
| 1295 | + " kind='df_edgelist', file_format=output_format)\n", |
| 1296 | + " print('-> filename:', filename)\n", |
1126 | 1297 | " if not overwrite and Path(filename).is_file():\n",
|
1127 | 1298 | " return\n",
|
1128 | 1299 | " if output_format == 'csv' or output_format == 'csv.gz':\n",
|
|
1141 | 1312 | " else:\n",
|
1142 | 1313 | " raise RuntimeError(\"Neither 'graph_name' parameter given, nor 'graph' has 'name' attribute\")\n",
|
1143 | 1314 | "\n",
|
1144 |
| - " print('graph_name: ', graph_name)\n", |
| 1315 | + " print('-> graph_name:', graph_name)\n", |
1145 | 1316 | " save_graph_df(df, graph_name,\n",
|
1146 | 1317 | " datasets_dir=datasets_dir, output_format=output_format, overwrite=overwrite)\n",
|
1147 | 1318 | "\n",
|
|
1154 | 1325 | "\n",
|
1155 | 1326 | "\n",
|
1156 | 1327 | "def load_graph_df(graph_name, datasets_dir='datasets', input_format='csv.gz'):\n",
|
1157 |
| - " filename = _repo_name_to_edgelist_basename(graph_name, out_dir=datasets_dir) + \\\n", |
1158 |
| - " '.'+input_format\n", |
| 1328 | + " filename = _savefile_name(graph_name, out_dir=datasets_dir,\n", |
| 1329 | + " kind='df_edgelist', file_format=input_format)\n", |
| 1330 | + " print('<- filename:', filename)\n", |
1159 | 1331 | " return load_graph_df_from_file(filename, input_format=input_format)\n",
|
1160 | 1332 | "\n"
|
1161 | 1333 | ]
|
|
1176 | 1348 | "name": "stdout",
|
1177 | 1349 | "output_type": "stream",
|
1178 | 1350 | "text": [
|
| 1351 | + "repository path: repos/hellogitworld.git\n", |
1179 | 1352 | "saving commit graph of: hellogitworld.git\n",
|
1180 |
| - "graph_name: hellogitworld.git\n", |
| 1353 | + "graph.name = hellogitworld.git\n", |
| 1354 | + "commit graph name: hellogitworld-commit_graph\n", |
| 1355 | + "testing save_graph()\n", |
| 1356 | + "-> graph_name: hellogitworld.git\n", |
| 1357 | + "-> filename: datasets\\hellogitworld.git.df_edgelist.csv.gz\n", |
| 1358 | + "testing save_graph_df()\n", |
| 1359 | + "-> filename: datasets\\hellogitworld-commit_graph.df_edgelist.csv.gz\n", |
1181 | 1360 | "there should be appropriately named file in the list below:\n"
|
1182 | 1361 | ]
|
1183 | 1362 | },
|
1184 | 1363 | {
|
1185 | 1364 | "data": {
|
1186 | 1365 | "text/plain": [
|
1187 |
| - "['hellogitworld-commit_graph.df_edgelist.csv.gz 523']" |
| 1366 | + "['hellogitworld-commit_graph.adjlist.txt 868',\n", |
| 1367 | + " 'hellogitworld-commit_graph.df_edgelist.csv.gz 523',\n", |
| 1368 | + " 'hellogitworld.git.df_edgelist.csv.gz 514']" |
1188 | 1369 | ]
|
1189 | 1370 | },
|
1190 | 1371 | "execution_count": null,
|
|
1193 | 1374 | }
|
1194 | 1375 | ],
|
1195 | 1376 | "source": [
|
| 1377 | + "print('repository path: {}'.format(repo_path))\n", |
1196 | 1378 | "print('saving commit graph of: {}'.format(repo_name))\n",
|
| 1379 | + "print('graph.name = {}'.format(repo_graph.name))\n", |
| 1380 | + "print('commit graph name: {}'.format(_repo_graph_name(repo_path)))\n", |
| 1381 | + "print('testing save_graph()')\n", |
1197 | 1382 | "save_graph(repo_graph)\n",
|
1198 |
| - "save_graph_df(repo_graph_df, graph_name=repo_name)\n", |
| 1383 | + "print('testing save_graph_df()')\n", |
| 1384 | + "save_graph_df(repo_graph_df, graph_name=_repo_graph_name(repo_path))\n", |
1199 | 1385 | "print('there should be appropriately named file in the list below:')\n",
|
1200 | 1386 | "[\"{name:<50} {size:>7}\".format(name=p.name,size=p.stat().st_size) for p in Path(\"datasets\").glob(\"hellogitworld*\")]"
|
1201 | 1387 | ]
|
|
1210 | 1396 | "output_type": "stream",
|
1211 | 1397 | "text": [
|
1212 | 1398 | "restoring commit graph of: hellogitworld.git\n",
|
| 1399 | + "<- filename: datasets\\hellogitworld.git.df_edgelist.csv.gz\n", |
1213 | 1400 | "ok - dataset and restored dataset are equal\n"
|
1214 | 1401 | ]
|
1215 | 1402 | }
|
|
1265 | 1452 | "- graph\n",
|
1266 | 1453 | "- lvl\n",
|
1267 | 1454 | "- mpi_ext\n",
|
1268 |
| - "reachability labels should be computed for all nodes\n" |
| 1455 | + "ok - graph has both \"lvl\" and \"mpi_ext\" attributes\n", |
| 1456 | + "reachability labels should be computed for all nodes\n", |
| 1457 | + "ok - both lvl and mpi_ext keys are all 55 graph nodes\n" |
1269 | 1458 | ]
|
1270 | 1459 | }
|
1271 | 1460 | ],
|
|
1279 | 1468 | " print('- {:s}'.format(attr))\n",
|
1280 | 1469 | "assert hasattr(repo_graph, 'lvl')\n",
|
1281 | 1470 | "assert hasattr(repo_graph, 'mpi_ext')\n",
|
| 1471 | + "print('ok - graph has both \"{}\" and \"{}\" attributes'.format('lvl', 'mpi_ext'))\n", |
1282 | 1472 | " \n",
|
1283 | 1473 | "print('reachability labels should be computed for all nodes')\n",
|
1284 | 1474 | "assert set(repo_graph.lvl.keys()) == set(repo_graph.nodes)\n",
|
1285 |
| - "assert set(repo_graph.mpi_ext.keys()) == set(repo_graph.nodes)" |
| 1475 | + "assert set(repo_graph.mpi_ext.keys()) == set(repo_graph.nodes)\n", |
| 1476 | + "print('ok - both lvl and mpi_ext keys are all {} graph nodes'.format(len(repo_graph.nodes)))" |
1286 | 1477 | ]
|
1287 | 1478 | },
|
1288 | 1479 | {
|
|
0 commit comments