@@ -325,10 +325,14 @@ <h1>Utility Functions (Text Data Processing)<a class="headerlink" href="#utility
325325< span class ="kn "> import</ span > < span class ="nn "> matplotlib.pyplot</ span > < span class ="k "> as</ span > < span class ="nn "> plt</ span >
326326< span class ="kn "> from</ span > < span class ="nn "> tqdm.notebook</ span > < span class ="kn "> import</ span > < span class ="n "> tqdm</ span >
327327< span class ="kn "> from</ span > < span class ="nn "> wordcloud</ span > < span class ="kn "> import</ span > < span class ="n "> WordCloud</ span >
328+ < span class ="kn "> from</ span > < span class ="nn "> spacy.lang.en.stop_words</ span > < span class ="kn "> import</ span > < span class ="n "> STOP_WORDS</ span >
329+ < span class ="kn "> from</ span > < span class ="nn "> spacy.tokens</ span > < span class ="kn "> import</ span > < span class ="n "> Doc</ span >
330+ < span class ="kn "> from</ span > < span class ="nn "> spacy.language</ span > < span class ="kn "> import</ span > < span class ="n "> Language</ span >
328331
329332< span class ="c1 "> # Add tqdm functions to pandas.</ span >
330333< span class ="n "> tqdm</ span > < span class ="o "> .</ span > < span class ="n "> pandas</ span > < span class ="p "> ()</ span >
331334
335+
332336< span class ="k "> def</ span > < span class ="nf "> check_answer_df</ span > < span class ="p "> (</ span > < span class ="n "> df_result</ span > < span class ="p "> ,</ span > < span class ="n "> df_answer</ span > < span class ="p "> ,</ span > < span class ="n "> n</ span > < span class ="o "> =</ span > < span class ="mi "> 1</ span > < span class ="p "> ):</ span >
333337 < span class ="sd "> """</ span >
334338< span class ="sd "> This function checks if two output dataframes are the same.</ span >
@@ -462,6 +466,12 @@ <h1>Utility Functions (Text Data Processing)<a class="headerlink" href="#utility
462466 < span class ="n "> plt</ span > < span class ="o "> .</ span > < span class ="n "> show</ span > < span class ="p "> ()</ span >
463467
464468
469+ < span class ="nd "> @Language</ span > < span class ="o "> .</ span > < span class ="n "> component</ span > < span class ="p "> (</ span > < span class ="s2 "> "lowercase_text"</ span > < span class ="p "> )</ span >
470+ < span class ="k "> def</ span > < span class ="nf "> lowercase_text</ span > < span class ="p "> (</ span > < span class ="n "> doc</ span > < span class ="p "> ):</ span >
471+ < span class ="c1 "> # Create a new Doc with the lowercase text, using the same Vocab</ span >
472+ < span class ="k "> return</ span > < span class ="n "> Doc</ span > < span class ="p "> (</ span > < span class ="n "> doc</ span > < span class ="o "> .</ span > < span class ="n "> vocab</ span > < span class ="p "> ,</ span > < span class ="n "> words</ span > < span class ="o "> =</ span > < span class ="p "> [</ span > < span class ="n "> t</ span > < span class ="o "> .</ span > < span class ="n "> text</ span > < span class ="o "> .</ span > < span class ="n "> lower</ span > < span class ="p "> ()</ span > < span class ="k "> for</ span > < span class ="n "> t</ span > < span class ="ow "> in</ span > < span class ="n "> doc</ span > < span class ="p "> ])</ span >
473+
474+
465475< span class ="k "> def</ span > < span class ="nf "> add_spacy_doc</ span > < span class ="p "> (</ span > < span class ="n "> df</ span > < span class ="p "> ,</ span > < span class ="n "> nlp</ span > < span class ="p "> ):</ span >
466476 < span class ="sd "> """</ span >
467477< span class ="sd "> Add a column with the spaCy Doc objects.</ span >
@@ -497,7 +507,7 @@ <h1>Utility Functions (Text Data Processing)<a class="headerlink" href="#utility
497507< span class ="sd "> Parameters</ span >
498508< span class ="sd "> ----------</ span >
499509< span class ="sd "> df : pandas.DataFrame</ span >
500- < span class ="sd "> The dataframe containing at least the "doc" column.</ span >
510+ < span class ="sd "> The dataframe containing at least the "doc" column (spaCy Doc objects) .</ span >
501511
502512< span class ="sd "> Returns</ span >
503513< span class ="sd "> -------</ span >
@@ -507,9 +517,10 @@ <h1>Utility Functions (Text Data Processing)<a class="headerlink" href="#utility
507517 < span class ="c1 "> # Copy the dataframe to avoid editing the original one.</ span >
508518 < span class ="n "> df</ span > < span class ="o "> =</ span > < span class ="n "> df</ span > < span class ="o "> .</ span > < span class ="n "> copy</ span > < span class ="p "> (</ span > < span class ="n "> deep</ span > < span class ="o "> =</ span > < span class ="kc "> True</ span > < span class ="p "> )</ span >
509519
510- < span class ="n "> df</ span > < span class ="p "> [</ span > < span class ="s2 "> "spacy_tokens"</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="n "> df</ span > < span class ="p "> [</ span > < span class ="s2 "> "doc"</ span > < span class ="p "> ]</ span > < span class ="o "> .</ span > < span class ="n "> apply</ span > < span class ="p "> (</ span >
511- < span class ="k "> lambda</ span > < span class ="n "> tokens</ span > < span class ="p "> :</ span > < span class ="p "> [</ span > < span class ="n "> token</ span > < span class ="o "> .</ span > < span class ="n "> lemma_</ span > < span class ="k "> for</ span > < span class ="n "> token</ span > < span class ="ow "> in</ span > < span class ="n "> tokens</ span > < span class ="k "> if</ span > < span class ="n "> token</ span > < span class ="o "> .</ span > < span class ="n "> is_alpha</ span > < span class ="ow "> and</ span > < span class ="ow "> not</ span > < span class ="n "> token</ span > < span class ="o "> .</ span > < span class ="n "> is_stop</ span > < span class ="p "> ]</ span >
512- < span class ="p "> )</ span >
520+ < span class ="n "> df</ span > < span class ="p "> [</ span > < span class ="s2 "> "spacy_tokens"</ span > < span class ="p "> ]</ span > < span class ="o "> =</ span > < span class ="p "> [</ span >
521+ < span class ="p "> [</ span > < span class ="n "> t</ span > < span class ="o "> .</ span > < span class ="n "> lemma_</ span > < span class ="k "> for</ span > < span class ="n "> t</ span > < span class ="ow "> in</ span > < span class ="n "> doc</ span > < span class ="k "> if</ span > < span class ="n "> t</ span > < span class ="o "> .</ span > < span class ="n "> is_alpha</ span > < span class ="ow "> and</ span > < span class ="n "> t</ span > < span class ="o "> .</ span > < span class ="n "> lemma_</ span > < span class ="o "> .</ span > < span class ="n "> lower</ span > < span class ="p "> ()</ span > < span class ="ow "> not</ span > < span class ="ow "> in</ span > < span class ="n "> STOP_WORDS</ span > < span class ="p "> ]</ span >
522+ < span class ="k "> for</ span > < span class ="n "> doc</ span > < span class ="ow "> in</ span > < span class ="n "> df</ span > < span class ="p "> [</ span > < span class ="s2 "> "doc"</ span > < span class ="p "> ]</ span >
523+ < span class ="p "> ]</ span >
513524
514525 < span class ="k "> return</ span > < span class ="n "> df</ span >
515526
0 commit comments