Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions enhanced_course_table_with_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import argparse
from typing import List
import pandas as pd
from pandas.core.series import Series
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Names of the columns expected in the input CSV file
# (looked up after header whitespace is stripped in load_table_entries).
TOPIC_COLUMN_NAME = "topic"
DIFFICULTY_COLUMN_NAME = "difficulty"
PRICE_COLUMN_NAME = "price"
RELEASE_YEAR_COLUMN_NAME = "release_year"
URL_COLUMN_NAME = "url"
LABEL_COLUMN_NAME = "label"
AUTHOR_COLUMN_NAME = "author"
FORMAT_COLUMN_NAME = "format"
DESCRIPTION_COLUMN_NAME = "description"

# Marker line that must appear exactly TWICE in the README: once before and
# once after the autogenerated table (see inject_markdown_table_into_readme).
AUTOGENERATED_COURSES_TABLE_TOKEN = "<!--- AUTOGENERATED_COURSES_TABLE -->"

# HTML comment emitted above the generated table to discourage manual edits.
WARNING_HEADER = [
    "<!---",
    " WARNING: DO NOT EDIT THIS TABLE MANUALLY. IT IS AUTOMATICALLY GENERATED.",
    " HEAD OVER TO CONTRIBUTING.MD FOR MORE DETAILS ON HOW TO MAKE CHANGES PROPERLY.",
    "-->"
]

# Markdown header + separator rows; format_entry must emit cells in this order.
TABLE_HEADER = [
    "| **topic** | **course format** | **difficulty** | **release year** | **price** | **course** | **related topics** |",
    "|:---------:|:-----------------:|:--------------:|:----------------:|:---------:|:----------:|:------------------:|"
]

# Maps the CSV's numeric difficulty (1-3) to a visual three-segment indicator.
DIFFICULTY_MAP = {
    1: "🟩⬜⬜",
    2: "🟩🟩⬜",
    3: "🟩🟩🟩"
}

def read_lines_from_file(path: str) -> List[str]:
    """Read a text file and return its lines with trailing whitespace removed.

    Args:
        path: Path of the file to read.

    Returns:
        One list entry per line, without trailing newline/whitespace.
    """
    # Explicit UTF-8: the README contains emoji (difficulty indicators), so
    # relying on the platform default encoding (e.g. cp1252 on Windows) fails.
    with open(path, encoding="utf-8") as file:
        return [line.rstrip() for line in file]

def save_lines_to_file(path: str, lines: List[str]) -> None:
    """Write lines to a file, appending a newline after each one.

    Args:
        path: Destination file path (overwritten if it exists).
        lines: Lines to write, without trailing newlines.
    """
    # Explicit UTF-8 for the same reason as read_lines_from_file: the table
    # contains emoji. writelines + f-strings replaces the dated %-formatting
    # loop and batches the writes.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)

def format_entry(entry: Series, related_topics: List[str]) -> str:
    """Render one course record as a Markdown table row.

    Args:
        entry: A single course; must expose all *_COLUMN_NAME fields.
        related_topics: Topic names of similar courses for the last column.

    Returns:
        A Markdown row whose cell order matches TABLE_HEADER.
    """
    course_cell = (
        f"[{entry.loc[LABEL_COLUMN_NAME]}]({entry.loc[URL_COLUMN_NAME]})"
        f" by {entry.loc[AUTHOR_COLUMN_NAME]}"
    )
    cells = [
        entry.loc[TOPIC_COLUMN_NAME],
        entry.loc[FORMAT_COLUMN_NAME],
        DIFFICULTY_MAP[entry.loc[DIFFICULTY_COLUMN_NAME]],
        entry.loc[RELEASE_YEAR_COLUMN_NAME],
        entry.loc[PRICE_COLUMN_NAME],
        course_cell,
        ", ".join(related_topics),
    ]
    return "| " + " | ".join(str(cell) for cell in cells) + " |"

def load_table_entries(path: str) -> List[str]:
    """Load courses from a CSV file and render them as Markdown table rows.

    Related courses are found by TF-IDF-vectorizing each course's description
    and ranking the other courses by cosine similarity.

    Args:
        path: Path of the CSV file with one course per row.

    Returns:
        One formatted Markdown row (see format_entry) per course.
    """
    df = pd.read_csv(path)
    # Header cells in the CSV may carry stray whitespace; normalize once.
    df.columns = df.columns.str.strip()

    # Vectorize descriptions; missing descriptions become empty strings.
    descriptions = df[DESCRIPTION_COLUMN_NAME].fillna('')
    tfidf_matrix = TfidfVectorizer().fit_transform(descriptions)

    # All pairwise similarities in a single call instead of one row at a time.
    similarity_matrix = cosine_similarity(tfidf_matrix)

    related_topics_list = []
    for i, similarities in enumerate(similarity_matrix):
        # Most similar first; a course's self-similarity is always 1.0, so
        # drop index i to avoid listing a course as its own related topic.
        ranked = [j for j in similarities.argsort()[::-1] if j != i][:3]
        related_topics_list.append(df.iloc[ranked][TOPIC_COLUMN_NAME].tolist())

    # BUG FIX: df.iterrows() yields (index, row) pairs, so each zip element is
    # a 2-tuple; the original three-name unpacking raised ValueError at runtime.
    return [
        format_entry(row, related_topics)
        for (_, row), related_topics in zip(df.iterrows(), related_topics_list)
    ]

def search_lines_with_token(lines: List[str], token: str) -> List[int]:
    """Find every line containing the given token.

    Args:
        lines: Lines to scan.
        token: Substring to look for.

    Returns:
        Zero-based indexes of all matching lines, in order of appearance.
    """
    return [index for index, text in enumerate(lines) if token in text]

def inject_markdown_table_into_readme(readme_lines: List[str], table_lines: List[str]) -> List[str]:
    """Splice the generated table between the two autogeneration tokens.

    Args:
        readme_lines: Current README content, one entry per line.
        table_lines: Rendered Markdown table lines to insert.

    Returns:
        New README lines with everything between the start and end token
        replaced by table_lines (both token lines themselves are kept).

    Raises:
        ValueError: If the README does not contain exactly two token lines.
    """
    token_indexes = [
        index for index, line in enumerate(readme_lines)
        if AUTOGENERATED_COURSES_TABLE_TOKEN in line
    ]
    if len(token_indexes) != 2:
        raise ValueError(f"Please inject two {AUTOGENERATED_COURSES_TABLE_TOKEN} "
                         f"tokens to signal start and end of autogenerated table.")

    start_index, end_index = token_indexes
    return readme_lines[:start_index + 1] + table_lines + readme_lines[end_index:]

# Script entry point: regenerate the courses table inside the README.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data_path', default='automation/data.csv')
    parser.add_argument('-r', '--readme_path', default='README.md')
    arguments = parser.parse_args()

    # Full table = warning comment + column header + one row per course.
    generated_table = (
        WARNING_HEADER
        + TABLE_HEADER
        + load_table_entries(path=arguments.data_path)
    )

    # Rewrite the README with the fresh table spliced between the tokens.
    original_readme = read_lines_from_file(path=arguments.readme_path)
    updated_readme = inject_markdown_table_into_readme(
        readme_lines=original_readme,
        table_lines=generated_table,
    )
    save_lines_to_file(path=arguments.readme_path, lines=updated_readme)