diff --git a/projects/initial-data/main.js b/projects/initial-data/main.js index 4ba0702..70c163e 100644 --- a/projects/initial-data/main.js +++ b/projects/initial-data/main.js @@ -1,147 +1,75 @@ import { Octokit } from 'octokit'; import { getInput } from './setup.js'; - import { DataRecorder } from './dataRecorder.js'; const CSV_FILE_NAME = `initialTopicRepoData-${Date.now()}.csv`; -export async function fetchRepoCreationDate(octokit, owner, repo) { - console.log(`Fetching creation date for repository: ${owner}/${repo}`); - const response = await octokit.request('GET /repos/{owner}/{repo}', { - owner, - repo, - }); - return Date.parse(response.data.created_at); -} - -async function fetchFirstCommitDate(octokit, owner, repo) { - console.log(`Fetching first commit date for repository: ${owner}/${repo}`); - try { - const response = await octokit.request( - 'GET /repos/{owner}/{repo}/commits', - { - owner, - repo, - per_page: 1, - }, - ); - - const lastPageUrl = response.headers.link?.match( - /<([^>]+)>;\s*rel="last"/, - )?.[1]; - - if (!lastPageUrl) { - if (response.data.length > 0) { - response.data[0].commit.author.date; - } else { - throw new Error(`No commits found for ${owner}/${repo}`); - } - } - - const lastPageResponse = await octokit.request(lastPageUrl); - - if (lastPageResponse.data.length > 0) { - return Date.parse(lastPageResponse.data[0].commit.author.date); - } else { - throw new Error(`No commits found ${owner}/${repo}`); - } - } catch (err) { - throw new Error(`Could not find any commits for ${owner}/${repo}`); - } -} - -async function fetchRepoTopics(octokit, owner, repo) { - console.log(`Fetching topics for repository: ${owner}/${repo}`); - const response = await octokit.request('GET /repos/{owner}/{repo}/topics', { - owner, - repo, - }); - return response.data.names; -} - -async function fetchFirstReleaseDate(octokit, owner, repo) { - console.log(`Fetching first release date for repository: ${owner}/${repo}`); - try { - const response = await 
/**
 * Fetch repositories tagged with a GitHub topic through the search API,
 * most recently updated first.
 *
 * @param {object} octokit - Client exposing `rest.search.repos(params)`.
 * @param {string} topic - Topic name, sent as the `topic:<name>` qualifier.
 * @param {number} numRepos - Maximum number of repositories to return, or
 *   `-1` for "as many as possible". Either way the result is capped at 1000.
 * @returns {Promise<object[]>} At most `numRepos` search result items; an
 *   empty array when the limit is zero, negative (other than -1) or NaN.
 */
async function fetchRepositoriesWithTopic(octokit, topic, numRepos) {
  console.log(`Fetching repositories with topic: ${topic}`);

  const MAX_PAGE_SIZE = 100; // allow max 100 items/page
  const MAX_SEARCH_RESULTS = 1000; // cap at 1000 results per query

  const maxResults = Math.floor(
    numRepos === -1
      ? MAX_SEARCH_RESULTS
      : Math.min(numRepos, MAX_SEARCH_RESULTS),
  );

  // Also catches NaN: nothing was asked for, so make no request at all.
  if (!(maxResults > 0)) {
    return [];
  }

  // Never request more items per page than the caller can use. The page size
  // stays constant across requests so page numbers remain meaningful.
  const perPage = Math.min(MAX_PAGE_SIZE, maxResults);
  const pages = Math.ceil(maxResults / perPage);

  const allRepos = [];

  for (let page = 1; page <= pages; page++) {
    console.log(`Fetching page ${page} of ${pages} for topic: ${topic}`);

    const response = await octokit.rest.search.repos({
      q: `topic:${topic}`,
      per_page: perPage,
      page,
      sort: 'updated',
      order: 'desc',
    });

    const repos = response.data.items;
    allRepos.push(...repos);

    // A short page means the search is exhausted; a full quota means we are
    // done. Either way, stop before paying for the rate-limit pause.
    if (repos.length < perPage || allRepos.length >= maxResults) {
      break;
    }

    // Rate limits: pause between consecutive page requests.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }

  // Truncate unconditionally so a server that returns more items than
  // requested can never push the result past the caller's limit.
  return allRepos.slice(0, maxResults);
}
/**
 * Build one CSV row from a repository item returned by the search API.
 *
 * No extra requests are made: only fields already present on `repo` are used.
 *
 * @param {object} repo - Search result item; reads `full_name`, `created_at`
 *   and `topics`.
 * @returns {{repository: string, repoTopics: string, date_first_commit: null,
 *   creation: (number|null), date_first_release: null}} Row whose key order
 *   is significant, because `main` serialises it with `Object.values`.
 */
function extractRepositoryData(repo) {
  console.log(`Extracting data for repository: ${repo.full_name}`);

  // Date.parse yields NaN for a missing or malformed timestamp; report it as
  // null so every "unknown date" in the row is represented the same way.
  const parsedCreation = Date.parse(repo.created_at);
  const creationDate = Number.isNaN(parsedCreation) ? null : parsedCreation;

  // Topics are directly available in the search response. Anything that is
  // not an array is treated as "no topics" instead of throwing on `.join`.
  const repoTopics = Array.isArray(repo.topics) ? repo.topics : [];

  return {
    repository: repo.full_name,
    // Wrapped in double quotes so the embedded commas stay in one CSV field.
    repoTopics: `"${repoTopics.join(', ')}"`,
    date_first_commit: null, // skipping this as per requirements
    creation: creationDate,
    date_first_release: null, // skipping this as per requirements
  };
}
allowed it because it is appropriate + date_first_release: null, // skipping this as per requirements }; - + return singleRowData; } async function main(token, topic, numRepos) { const octokit = new Octokit({ auth: token }); - - const iterator = octokit.paginate.iterator(octokit.rest.search.repos, { - q: `topic:${topic}`, - per_page: 100, - }); - let processedRepos = 0; - + + // Fetch repositories with the specified topic + const repositories = await fetchRepositoriesWithTopic(octokit, topic, numRepos); + console.log(`Found ${repositories.length} repositories with topic: ${topic}`); + + // Set up CSV recorder const csvColumns = [ 'repo', 'repo_topics', @@ -150,26 +78,21 @@ async function main(token, topic, numRepos) { 'date_first_release', ]; const dataRecorder = new DataRecorder(`./data/${CSV_FILE_NAME}`, csvColumns); - - for await (const iteration of iterator) { - const data = iteration.data; - for (const repo of data) { - try { - if (numRepos !== -1 && processedRepos >= numRepos) break; - const dataRow = await processRepository( - octokit, - repo.owner.login, - repo.name, - ); - - dataRecorder.appendToCSV(Object.values(dataRow)); - processedRepos++; - console.log(`processed ${processedRepos}`); - } catch (err) { - console.error(err); - } + + // Process each repository + let processedRepos = 0; + for (const repo of repositories) { + try { + const dataRow = extractRepositoryData(repo); + dataRecorder.appendToCSV(Object.values(dataRow)); + processedRepos++; + console.log(`Processed ${processedRepos}/${repositories.length}`); + } catch (err) { + console.error(`Error processing repository ${repo.full_name}:`, err); } } + + console.log(`Completed processing ${processedRepos} repositories`); } export function runMain() { @@ -180,3 +103,6 @@ export function runMain() { main(token, topic, numRepos); } + +// testing purposes +export { fetchRepositoriesWithTopic, extractRepositoryData }; diff --git a/projects/initial-data/processRespository.test.js 
b/projects/initial-data/processRespository.test.js index 23fa97f..4541b06 100644 --- a/projects/initial-data/processRespository.test.js +++ b/projects/initial-data/processRespository.test.js @@ -2,7 +2,7 @@ import { tmpdir } from 'node:os'; import { sep } from 'node:path'; import fs from 'fs'; -import { fetchRepoCreationDate } from './main.js'; +import { fetchRepositoriesWithTopic, extractRepositoryData } from './main.js'; import { getInput } from './setup.js'; import { Octokit } from 'octokit'; import { DataRecorder } from './dataRecorder.js'; @@ -44,10 +44,27 @@ describe('Basic tests', () => { }); }); - it('Simple use of Octokit calls the GitHub API', async () => { - let data; - data = await fetchRepoCreationDate(octokit, 'octocat', 'hello-world'); - expect(data).toEqual(1550934514000); + it('fetchRepositoriesWithTopic fetches repositories with the specified topic', async () => { + const repos = await fetchRepositoriesWithTopic(octokit, 'json-schema', 3); + expect(repos.length).toBeGreaterThan(0); + expect(repos.length).toBeLessThanOrEqual(3); + expect(repos[0].topics).toContain('json-schema'); + }); + + it('extractRepositoryData correctly extracts data from a repository', () => { + const mockRepo = { + full_name: 'owner/repo', + created_at: '2020-01-01T00:00:00Z', + topics: ['json-schema', 'validation'] + }; + + const data = extractRepositoryData(mockRepo); + + expect(data.repository).toBe('owner/repo'); + expect(data.repoTopics).toBe('"json-schema, validation"'); + expect(data.creation).toBe(Date.parse('2020-01-01T00:00:00Z')); + expect(data.date_first_commit).toBeNull(); + expect(data.date_first_release).toBeNull(); }); it('DataRecorder writes JSON data to csv file', async () => {