"""Generate the courses markdown table and inject it into the README.

Module ``enhanced_course_table_with_ai.py``.

The script reads course metadata from a CSV file, derives a "related topics"
column from the TF-IDF cosine similarity of the course descriptions, renders
one markdown table row per course and writes the table between two marker
lines of the README.
"""
import argparse
from typing import List

import pandas as pd
from pandas.core.series import Series

# Constants for column names
TOPIC_COLUMN_NAME = "topic"
DIFFICULTY_COLUMN_NAME = "difficulty"
PRICE_COLUMN_NAME = "price"
RELEASE_YEAR_COLUMN_NAME = "release_year"
URL_COLUMN_NAME = "url"
LABEL_COLUMN_NAME = "label"
AUTHOR_COLUMN_NAME = "author"
FORMAT_COLUMN_NAME = "format"
DESCRIPTION_COLUMN_NAME = "description"

# Token marking the start and the end of the auto-generated table in README.
# NOTE(review): the token is an empty string here, so *every* README line
# matches it. It was presumably an HTML comment marker that got stripped
# somewhere along the way -- restore the real value before relying on this.
AUTOGENERATED_COURSES_TABLE_TOKEN = ""

# Warning header emitted above the auto-generated table.
# NOTE(review): currently a single empty line; presumably it also lost its
# original (HTML comment) content -- confirm.
WARNING_HEADER = [
    ""
]

# Table header for the courses table
TABLE_HEADER = [
    "| **topic** | **course format** | **difficulty** | **release year** | **price** | **course** | **related topics** |",
    "|:---------:|:-----------------:|:--------------:|:----------------:|:---------:|:----------:|:------------------:|"
]

# Difficulty mapping to visual indicators
DIFFICULTY_MAP = {
    1: "🟩⬜⬜",
    2: "🟩🟩⬜",
    3: "🟩🟩🟩"
}

# Maximum number of related topics listed for each course
RELATED_TOPICS_COUNT = 3


def read_lines_from_file(path: str) -> List[str]:
    """Return the lines of the file at *path* with trailing whitespace removed."""
    # Explicit UTF-8: the generated table contains emoji, which the platform
    # default encoding cannot always decode.
    with open(path, encoding="utf-8") as file:
        return [line.rstrip() for line in file]


def save_lines_to_file(path: str, lines: List[str]) -> None:
    """Write *lines* to the file at *path*, one per line, replacing its content."""
    with open(path, "w", encoding="utf-8") as file:
        file.writelines(f"{line}\n" for line in lines)


def format_entry(entry: Series, related_topics: List[str]) -> str:
    """Render one course as a markdown table row.

    Args:
        entry: A row of the courses table, indexed by the ``*_COLUMN_NAME``
            constants.
        related_topics: Topics to list in the "related topics" cell.

    Returns:
        The markdown table row for the course.

    Raises:
        KeyError: If a column is missing from *entry* or its difficulty is
            not a key of ``DIFFICULTY_MAP``.
    """
    topic = entry.loc[TOPIC_COLUMN_NAME]
    difficulty = DIFFICULTY_MAP[entry.loc[DIFFICULTY_COLUMN_NAME]]
    release_year = entry.loc[RELEASE_YEAR_COLUMN_NAME]
    price = entry.loc[PRICE_COLUMN_NAME]
    course_format = entry.loc[FORMAT_COLUMN_NAME]
    url = entry.loc[URL_COLUMN_NAME]
    label = entry.loc[LABEL_COLUMN_NAME]
    author = entry.loc[AUTHOR_COLUMN_NAME]
    # str() keeps the join from failing on non-string topics (e.g. NaN).
    related = ", ".join(str(related_topic) for related_topic in related_topics)
    return (
        f"| {topic} | {course_format} | {difficulty} | {release_year} | {price} "
        f"| [{label}]({url}) by {author} | {related} |"
    )


def _find_related_topics(
    df: pd.DataFrame, top_k: int = RELATED_TOPICS_COUNT
) -> List[List[str]]:
    """Return, for each row of *df*, the topics of its most similar courses.

    Similarity is the cosine similarity between the TF-IDF vectors of the
    course descriptions. The course itself and courses sharing no term with
    it are never reported; duplicate topic names are listed once.
    """
    # Imported here because this helper is the only user of scikit-learn;
    # the README helpers stay importable without it.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    topics = df[TOPIC_COLUMN_NAME].astype(str).tolist()
    descriptions = df[DESCRIPTION_COLUMN_NAME].fillna("").astype(str)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(descriptions)
    except ValueError:
        # Empty vocabulary: no rows, or no description contains a usable term.
        return [[] for _ in topics]

    # One pairwise computation on the sparse matrix instead of one per row.
    similarity = cosine_similarity(tfidf_matrix)

    related_topics_list = []
    for row_index, scores in enumerate(similarity):
        related_topics: List[str] = []
        # Stable sort on the negated scores: descending, deterministic on ties.
        for candidate_index in (-scores).argsort(kind="stable"):
            if scores[candidate_index] <= 0 or len(related_topics) >= top_k:
                break
            candidate_topic = topics[candidate_index]
            if candidate_index == row_index or candidate_topic in related_topics:
                continue
            related_topics.append(candidate_topic)
        related_topics_list.append(related_topics)
    return related_topics_list


def load_table_entries(path: str) -> List[str]:
    """Load the courses CSV at *path* and render it as markdown table rows.

    Column names are stripped of surrounding whitespace before use. Each row
    gets a "related topics" cell computed from the course descriptions.

    Returns:
        One markdown table row per CSV row, in file order.
    """
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip()

    related_topics_list = _find_related_topics(df)

    # iterrows() yields (index, row) pairs, hence the nested unpacking.
    return [
        format_entry(row, related_topics)
        for (_, row), related_topics in zip(df.iterrows(), related_topics_list)
    ]


def search_lines_with_token(lines: List[str], token: str) -> List[int]:
    """Return the indexes of the *lines* that contain *token*."""
    return [line_index for line_index, line in enumerate(lines) if token in line]


def inject_markdown_table_into_readme(readme_lines: List[str], table_lines: List[str]) -> List[str]:
    """Return *readme_lines* with *table_lines* placed between the two markers.

    The marker lines themselves are kept; whatever was between them is
    replaced by *table_lines*.

    Raises:
        ValueError: If the README does not contain exactly two lines with
            ``AUTOGENERATED_COURSES_TABLE_TOKEN``.
    """
    lines_with_token_indexes = search_lines_with_token(lines=readme_lines, token=AUTOGENERATED_COURSES_TABLE_TOKEN)
    if len(lines_with_token_indexes) != 2:
        raise ValueError(f"Please inject two {AUTOGENERATED_COURSES_TABLE_TOKEN} "
                         f"tokens to signal start and end of autogenerated table.")

    table_start_line_index, table_end_line_index = lines_with_token_indexes
    return readme_lines[:table_start_line_index + 1] + table_lines + readme_lines[table_end_line_index:]


def main() -> None:
    """Regenerate the courses table of the README from the CSV data file."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data_path', default='automation/data.csv')
    parser.add_argument('-r', '--readme_path', default='README.md')
    args = parser.parse_args()

    table_lines = load_table_entries(path=args.data_path)
    table_lines = WARNING_HEADER + TABLE_HEADER + table_lines
    readme_lines = read_lines_from_file(path=args.readme_path)
    readme_lines = inject_markdown_table_into_readme(readme_lines=readme_lines, table_lines=table_lines)
    save_lines_to_file(path=args.readme_path, lines=readme_lines)


if __name__ == "__main__":
    main()