Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 44 additions & 40 deletions src/fetchers/top-languages-fetcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -111,51 +111,55 @@ const fetchTopLanguages = async (
// filter out repositories to be hidden
repoNodes = repoNodes
.sort((a, b) => b.size - a.size)
.filter((name) => !repoToHide[name.name]);

let repoCount = 0;

repoNodes = repoNodes
.filter((node) => node.languages.edges.length > 0)
// flatten the list of language nodes
.reduce((acc, curr) => curr.languages.edges.concat(acc), [])
.reduce((acc, prev) => {
// get the size of the language (bytes)
let langSize = prev.size;

// if we already have the language in the accumulator
// & the current language name is same as previous name
// add the size to the language size and increase repoCount.
if (acc[prev.node.name] && prev.node.name === acc[prev.node.name].name) {
langSize = prev.size + acc[prev.node.name].size;
repoCount += 1;
} else {
// reset repoCount to 1
// language must exist in at least one repo to be detected
repoCount = 1;
.filter((name) => !repoToHide[name.name])
.filter((node) => node.languages.edges.length > 0);

// New normalized statistics logic: each repository contributes equal weight
const normalizedLanguages = {};
let totalRepoCount = 0;

// Process each repository, normalize its language distribution
repoNodes.forEach((repo) => {
// Calculate total bytes for this repository
const repoTotalSize = repo.languages.edges.reduce((sum, edge) => sum + edge.size, 0);

if (repoTotalSize === 0) return; // Skip empty repositories

totalRepoCount += 1;

// Calculate normalized proportion for each language in this repository
repo.languages.edges.forEach((edge) => {
const langName = edge.node.name;
const langColor = edge.node.color;
const normalizedSize = edge.size / repoTotalSize; // Language proportion in current repository

if (!normalizedLanguages[langName]) {
normalizedLanguages[langName] = {
name: langName,
color: langColor,
size: 0,
count: 0,
};
}
return {
...acc,
[prev.node.name]: {
name: prev.node.name,
color: prev.node.color,
size: langSize,
count: repoCount,
},
};
}, {});

// Accumulate normalized proportions
normalizedLanguages[langName].size += normalizedSize;
normalizedLanguages[langName].count += 1;
});
});

Object.keys(repoNodes).forEach((name) => {
// comparison index calculation
repoNodes[name].size =
Math.pow(repoNodes[name].size, size_weight) *
Math.pow(repoNodes[name].count, count_weight);
// Divide accumulated proportions by total repository count to get average proportions
Object.keys(normalizedLanguages).forEach((langName) => {
const lang = normalizedLanguages[langName];
// Average proportion of this language across all repositories, then apply weights
const avgProportion = lang.size / totalRepoCount;
lang.size = Math.pow(avgProportion, size_weight) * Math.pow(lang.count, count_weight);
});

const topLangs = Object.keys(repoNodes)
.sort((a, b) => repoNodes[b].size - repoNodes[a].size)
const topLangs = Object.keys(normalizedLanguages)
.sort((a, b) => normalizedLanguages[b].size - normalizedLanguages[a].size)
.reduce((result, key) => {
result[key] = repoNodes[key];
result[key] = normalizedLanguages[key];
return result;
}, {});

Expand Down
139 changes: 139 additions & 0 deletions test-normalized-example.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/**
* Test example for normalized language statistics
*
* This example demonstrates how the new normalized logic works
* compared to the old byte-sum approach.
*/

// Mock data: three repositories with different language distributions.
// Each row is [repository name, [[language, color, size in bytes], ...]];
// the rows are expanded below into the `languages.edges` shape that the
// calculate* functions in this file read.
const mockRepositories = [
  ["large-c-project", [["C", "#555555", 9000], ["JavaScript", "#f1e05a", 1000]]],
  ["web-app", [["JavaScript", "#f1e05a", 500], ["Python", "#3572A5", 300], ["CSS", "#563d7c", 200]]],
  ["python-script", [["Python", "#3572A5", 800], ["JavaScript", "#f1e05a", 200]]],
].map(([repoName, languageRows]) => ({
  name: repoName,
  languages: {
    edges: languageRows.map(([languageName, color, size]) => ({
      size,
      node: { name: languageName, color },
    })),
  },
}));

/**
 * Old logic: rank languages by their raw byte totals summed over all
 * repositories, so a single large repository can dominate the result.
 *
 * @param {Array<{languages: {edges: Array<{size: number, node: {name: string, color: string}}>}}>} repos
 *   Repositories whose language edges are summed.
 * @returns {Object<string, {name: string, color: string, size: number, percentage: string}>}
 *   Languages keyed by name, ordered from largest to smallest byte total.
 *   `percentage` is the share of all bytes, formatted with one decimal.
 */
function calculateOldLogic(repos) {
  const languages = {};

  for (const repo of repos) {
    for (const edge of repo.languages.edges) {
      const { name, color } = edge.node;
      if (!languages[name]) {
        languages[name] = { name, color, size: 0 };
      }
      languages[name].size += edge.size;
    }
  }

  const total = Object.values(languages).reduce((sum, lang) => sum + lang.size, 0);

  // When every edge is 0 bytes the total is 0 and size / total would be NaN;
  // report 0.0% instead, mirroring the zero-size guard in calculateNewLogic.
  const toPercentage = (size) => (total === 0 ? 0 : (size / total) * 100).toFixed(1);

  return Object.fromEntries(
    Object.entries(languages)
      .sort(([, a], [, b]) => b.size - a.size)
      .map(([name, lang]) => [name, { ...lang, percentage: toPercentage(lang.size) }]),
  );
}

/**
 * New logic: every repository carries equal weight. Each language's share
 * of a repository's bytes is summed across repositories and then divided
 * by the number of repositories that had a non-zero byte total.
 *
 * @param {Array<{languages: {edges: Array<{size: number, node: {name: string, color: string}}>}}>} repos
 *   Repositories to evaluate; those whose edges sum to 0 bytes are ignored.
 * @returns {Object<string, {name: string, color: string, size: number, count: number, percentage: string}>}
 *   Languages keyed by name, ordered from largest to smallest average share.
 *   `size` is the average share (0..1), `count` the number of counted
 *   repositories listing the language, `percentage` is `size * 100` with
 *   one decimal.
 */
function calculateNewLogic(repos) {
  const shareTotals = {};
  let countedRepos = 0;

  for (const { languages } of repos) {
    const { edges } = languages;

    let repoBytes = 0;
    for (const { size } of edges) {
      repoBytes += size;
    }
    if (repoBytes === 0) {
      continue; // nothing to normalize against
    }
    countedRepos += 1;

    for (const { size, node } of edges) {
      const entry =
        shareTotals[node.name] ||
        (shareTotals[node.name] = { name: node.name, color: node.color, size: 0, count: 0 });
      entry.size += size / repoBytes;
      entry.count += 1;
    }
  }

  // Average first, then sort on the averaged value, then attach the label.
  const averaged = Object.entries(shareTotals).map(([key, lang]) => {
    const averageShare = lang.size / countedRepos;
    return [key, { ...lang, size: averageShare, percentage: (averageShare * 100).toFixed(1) }];
  });
  averaged.sort((left, right) => right[1].size - left[1].size);

  return Object.fromEntries(averaged);
}

// Run comparison: print the mock repositories, then the ranking produced by
// each calculation, then a short analysis derived from the computed results.
console.log("=== Language Statistics Comparison ===\n");

console.log("Repository Data:");
mockRepositories.forEach((repo, i) => {
  const total = repo.languages.edges.reduce((sum, edge) => sum + edge.size, 0);
  console.log(`${i + 1}. ${repo.name} (${total} bytes total):`);
  repo.languages.edges.forEach((edge) => {
    const percent = ((edge.size / total) * 100).toFixed(1);
    console.log(` - ${edge.node.name}: ${edge.size} bytes (${percent}%)`);
  });
});

console.log("\n--- OLD LOGIC (Direct byte sum) ---");
const oldResults = calculateOldLogic(mockRepositories);
Object.entries(oldResults).forEach(([name, lang]) => {
  console.log(`${name}: ${lang.size} bytes (${lang.percentage}%)`);
});

console.log("\n--- NEW LOGIC (Normalized per repository) ---");
const newResults = calculateNewLogic(mockRepositories);
Object.entries(newResults).forEach(([name, lang]) => {
  console.log(`${name}: ${lang.percentage}% (appears in ${lang.count} repos)`);
});

console.log("\n=== Analysis ===");
// Both result objects are ordered largest-first, so the first value is the
// leader. The summary is computed rather than hard-coded so it cannot drift
// from the numbers printed above.
const [oldLeader] = Object.values(oldResults);
const [newLeader] = Object.values(newResults);
if (oldLeader && newLeader) {
  console.log(
    `Old logic: ${oldLeader.name} dominates with ${oldLeader.percentage}% of all bytes`,
  );
  console.log(
    `New logic: ${newLeader.name} leads with ${newLeader.percentage}% averaged across repositories (appears in ${newLeader.count} repos)`,
  );
}
console.log("The new approach better represents overall language diversity!");