Skip to content

Commit a226f59

Browse files
authored
Merge pull request #30 from alan-turing-institute/licences
Add code for getting licence info
2 parents 2982d12 + 9bae6fc commit a226f59

File tree

1 file changed

+110
-0
lines changed

1 file changed

+110
-0
lines changed

src/github_analyser/licences.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
5+
import pandas as pd
6+
7+
from github_analyser.utils import camel_to_snake, request_github_graphql
8+
9+
10+
def _get_licence_query(org_name: str, repo_name: str) -> str:
11+
return f"""
12+
query {{
13+
repository(owner: "{org_name}", name: "{repo_name}") {{
14+
id
15+
url
16+
licenseInfo {{
17+
id
18+
name
19+
spdxId
20+
}}
21+
}}
22+
}}
23+
"""
24+
25+
26+
def get_licence(
    org_name: str,
    repo_name: str,
) -> pd.Series:
    """Fetch info about the licence of a GitHub repository.

    Args:
        org_name: The owner of the repository.
        repo_name: The name of the repository.

    Returns:
        A pandas Series with the entries ``repo_name``, ``repo_url``,
        ``repo_id``, ``name`` (the licence name) and ``spdx_id``. ``name`` and
        ``spdx_id`` are None when the repository reports no licence. When the
        response is empty or contains no repository, every entry except
        ``repo_name`` is None.
    """
    query = _get_licence_query(org_name, repo_name)
    response = request_github_graphql({"query": query})

    # NOTE(review): assumes request_github_graphql returns a dict-like payload
    # (or a falsy value on failure) -- confirm against its implementation.
    # An empty response, or one whose "data" / "repository" entry is missing
    # or null, is treated as "nothing known about this repository" instead of
    # raising while indexing into it.
    data = response.get("data") if response else None
    repository = (data.get("repository") if data else None) or {}

    licence_info = repository.get("licenseInfo")
    if licence_info is None:
        # No licence reported for the repository (or no repository at all).
        name = None
        spdx_id = None
    else:
        name = licence_info.get("name", "")
        spdx_id = licence_info.get("spdxId", "")

    return pd.Series(
        {
            "repo_name": repo_name,
            "repo_url": repository.get("url"),
            "repo_id": repository.get("id"),
            "name": name,
            "spdx_id": spdx_id,
        }
    )
72+
73+
74+
def get_licences(
    org_name: str,
    repo_names: list[str],
    save: bool | str = False,
) -> pd.DataFrame:
    """Get information about licences for multiple repositories of one owner.

    Args:
        org_name: The owner of the repositories.
        repo_names: A list of repository names.
        save (bool | str, optional): If True, save the data to
            "data/licences.csv". If a string, save to that path. Missing
            parent directories of the target file are created in both cases.
            Defaults to False.

    Returns:
        A pandas DataFrame with one row per entry in ``repo_names``, built
        from the Series returned by ``get_licence``; column labels are passed
        through ``camel_to_snake``.

    Raises:
        ValueError: If ``repo_names`` is not a list.
    """
    if not isinstance(repo_names, list):
        msg = "`repo_names` must be a list of repository names."
        raise ValueError(msg)

    rows = [get_licence(org_name, repo_name) for repo_name in repo_names]
    df = pd.DataFrame(rows).rename(columns=camel_to_snake)

    if save:
        target = Path("data/licences.csv") if save is True else Path(save)
        # Create the destination directory for custom paths as well as the
        # default one, so to_csv does not fail on a missing folder.
        # exist_ok=True makes a separate exists() check unnecessary.
        target.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(target, index=False)

    return df

0 commit comments

Comments
 (0)