diff --git a/notebooks/Analyzing_Hacker_News_with_Six_Language_Understanding_Methods.ipynb b/notebooks/Analyzing_Hacker_News_with_Six_Language_Understanding_Methods.ipynb index 27ce0335..ca080c1e 100644 --- a/notebooks/Analyzing_Hacker_News_with_Six_Language_Understanding_Methods.ipynb +++ b/notebooks/Analyzing_Hacker_News_with_Six_Language_Understanding_Methods.ipynb @@ -64,7 +64,7 @@ "from annoy import AnnoyIndex\n", "import warnings\n", "from sklearn.cluster import KMeans\n", - "from bertopic._ctfidf import ClassTFIDF\n", + "from bertopic.vectorizers._ctfidf import ClassTfidfTransformer\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "warnings.filterwarnings('ignore')\n", @@ -889,8 +889,8 @@ "# We'll need to embed the query using the same model that embedded the archive\n", "# so the query and archive are using the same embedding space.\n", "query_embed = co.embed(texts=[query],\n", - " model=\"small-20220425\", \n", - " truncate=\"RIGHT\").embeddings\n", + " model=\"small\", \n", + " truncate=\"LEFT\").embeddings\n", "\n", "# Retrieve the nearest neighbors\n", "similar_item_ids = search_index.get_nns_by_vector(query_embed[0],10,\n", @@ -1086,8 +1086,8 @@ "documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})\n", "count_vectorizer = CountVectorizer(stop_words=\"english\").fit(documents_per_topic.Document)\n", "count = count_vectorizer.transform(documents_per_topic.Document)\n", - "words = count_vectorizer.get_feature_names()\n", - "ctfidf = ClassTFIDF().fit_transform(count).toarray()\n", + "words = count_vectorizer.get_feature_names_out()\n", + "ctfidf = ClassTfidfTransformer().fit_transform(count).toarray()\n", "words_per_class = {label: [words[index] for index in ctfidf[label].argsort()[-10:]] for label in documents_per_topic.Topic}\n", "df['cluster'] = classes\n", "df['keywords'] = df['cluster'].map(lambda topic_num: \", \".join(np.array(words_per_class[topic_num])[:]))" @@ -1282,4 +1282,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}