diff --git a/.gitmodules b/.gitmodules index 81c0901..100ba25 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "real-time-wiki-covid-tracker"] path = real-time-wiki-covid-tracker url = https://github.com/digitalTranshumant/real-time-wiki-covid-tracker +[submodule "cdsc_reddit"] + path = reddit/cdsc_reddit + url = code:cdsc_reddit diff --git a/reddit/Makefile b/reddit/Makefile new file mode 100644 index 0000000..4373b98 --- /dev/null +++ b/reddit/Makefile @@ -0,0 +1,8 @@ +SHELL:=/bin/bash +comment_search_tasks.sh:gen_search_tasks.py + python3 gen_search_tasks.py /gscratch/comdata/output/reddit_comments_by_subreddit.parquet comment_search_tasks.sh + +submit_backfill_jobs:comment_search_tasks.sh + ./run_comment_search.sh + +PHONY:submit_backfill_jobs diff --git a/reddit/cdsc_reddit b/reddit/cdsc_reddit new file mode 160000 index 0000000..4ced659 --- /dev/null +++ b/reddit/cdsc_reddit @@ -0,0 +1 @@ +Subproject commit 4ced659d1961630c20a1ef817422f242f723af7f diff --git a/reddit/checkpoint_parallelsql.sbatch b/reddit/checkpoint_parallelsql.sbatch new file mode 100644 index 0000000..14425a4 --- /dev/null +++ b/reddit/checkpoint_parallelsql.sbatch @@ -0,0 +1,24 @@ +#!/bin/bash +## parallel_sql_job.sh +#SBATCH --job-name=find_covid_comments +## Allocation Definition +#SBATCH --account=comdata-ckpt +#SBATCH --partition=ckpt +## Resources +## Nodes. This should always be 1 for parallel-sql. +#SBATCH --nodes=1 +## Walltime (12 hours) +#SBATCH --time=12:00:00 +## Memory per node +#SBATCH --mem=32G +#SBATCH --cpus-per-task=4 +#SBATCH --ntasks=1 +#SBATCH -D /gscratch/comdata/users/nathante/COVID-19_Digital_Observatory/reddit +source ./bin/activate +module load parallel_sql +echo $(which perl) +#Put here commands to load other modules (e.g. matlab etc.) +#Below command means that parallel_sql will get tasks from the database +#and run them on the node (in parallel). So a 16 core node will have +#16 tasks running at one time. 
+parallel-sql --sql -a parallel --exit-on-term --jobs 4 diff --git a/reddit/comment_search_tasks.sh b/reddit/comment_search_tasks.sh new file mode 100644 index 0000000..9728a8b --- /dev/null +++ b/reddit/comment_search_tasks.sh @@ -0,0 +1,200 @@ +python3 search_comments.py part-00071-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00056-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00145-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00026-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00075-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00053-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00169-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00068-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00061-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00016-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00177-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00089-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00158-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00179-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00182-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00194-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00185-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00136-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 
search_comments.py part-00137-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00121-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00036-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00027-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00020-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00069-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00157-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00072-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00065-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00189-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00047-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00022-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00073-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00088-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00142-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00152-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00115-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00123-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00058-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00062-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py 
part-00008-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00161-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00029-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00135-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00059-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00175-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00193-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00079-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00087-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00078-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00100-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00160-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00033-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00040-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00153-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00017-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00091-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00134-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00198-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00067-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00004-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 
search_comments.py part-00034-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00080-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00046-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00001-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00092-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00124-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00102-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00126-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00114-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00095-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00148-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00010-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00076-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00066-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00005-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00042-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00113-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00162-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00127-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00030-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py 
part-00006-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00101-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00018-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00141-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00186-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00043-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00024-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00184-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00131-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00174-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00163-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00192-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00085-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00122-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00054-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00190-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00015-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00151-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00094-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00049-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00009-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 
search_comments.py part-00099-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00028-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00045-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00098-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00187-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00084-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00116-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00035-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00082-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00021-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00002-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00144-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00000-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00139-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00074-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00055-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00129-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00037-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00118-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00197-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py 
part-00097-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00014-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00025-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00128-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00044-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00180-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00104-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00140-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00150-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00019-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00086-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00159-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00077-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00103-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00125-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00112-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00117-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00120-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00111-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00132-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00149-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 
search_comments.py part-00173-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00007-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00191-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00063-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00052-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00181-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00176-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00172-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00012-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00108-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00168-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00038-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00119-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00057-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00013-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00188-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00146-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00170-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00096-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00183-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py 
part-00003-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00031-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00109-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00070-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00143-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00041-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00133-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00171-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00147-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00106-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00051-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00166-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00023-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00138-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00164-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00090-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00083-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00110-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00167-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00130-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00165-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 
search_comments.py part-00064-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00199-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00196-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00093-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00178-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00011-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00060-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00107-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00195-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00081-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00155-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00154-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00039-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00105-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00156-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00032-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00050-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet +python3 search_comments.py part-00048-d61007de-9cbe-4970-857f-b9fd4b35b741-c000.snappy.parquet diff --git a/reddit/concat_dataset.py b/reddit/concat_dataset.py new file mode 100644 index 0000000..1413a7b --- /dev/null +++ b/reddit/concat_dataset.py @@ -0,0 +1,10 @@ +from pyspark.sql import functions as f +from pyspark.sql import SparkSession + +spark = 
def gen_tasks(in_parquet="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",
              outfile="comment_search_tasks.sh"):
    """Write one ``search_comments.py`` command per parquet partition file.

    Args:
        in_parquet: Directory that holds the ``*.parquet`` partition files.
            Only files directly inside it are considered (no recursion).
        outfile: Path of the task list to write. Any existing file is
            overwritten. One command is written per line.

    The task list is empty when the directory contains no ``*.parquet`` files.
    """
    # Use the full file name (suffix included): search_comments.py joins its
    # argument onto the dataset directory, so a stem without ".parquet" would
    # point at a file that does not exist.
    # Sorting keeps the generated file stable between runs; glob order is
    # filesystem dependent.
    partition_names = sorted(p.name for p in Path(in_parquet).glob("*.parquet"))

    with open(outfile, 'w') as of:
        of.writelines(
            "python3 search_comments.py {0}\n".format(name)
            for name in partition_names
        )
def load_keywords(keywords):
    """Build an Aho-Corasick automaton from the ``label`` column of a CSV file.

    Args:
        keywords: Path (or anything ``pandas.read_csv`` accepts) of a CSV file
            that has a ``label`` column.

    Returns:
        A finalized ``ahocorasick.Automaton`` in which every lower-cased label
        is stored as both key and value.
    """
    # Missing labels are read as NaN (a float); str.lower would raise on them.
    labels = pd.read_csv(keywords).label.dropna()

    trie = ahocorasick.Automaton()
    for label in {label.lower() for label in labels}:
        trie.add_word(label, label)
    trie.make_automaton()
    return trie


# use the aho corasick algorithm to do the string matching
def match_comment_kwlist(body, trie, min_length=5):
    """Return the keywords found in ``body`` that are long enough.

    Args:
        body: Comment text. Anything that is not a ``str`` (e.g. ``None`` or
            NaN for a missing body) yields an empty list.
        trie: Object whose ``iter(text)`` yields items with the matched keyword
            at index 1, as built by ``load_keywords``.
        min_length: Minimum keyword length (in characters) for a hit to count.

    Returns:
        A list of matched keywords in the order ``trie.iter`` produced them.
        A keyword appears once per occurrence.
    """
    if not isinstance(body, str):
        return []
    # Matching is case-insensitive: the automaton stores lower-cased labels.
    return [hit[1] for hit in trie.iter(body.lower()) if len(hit[1]) >= min_length]


def filter_comments(partition=None,
                    keywords="../keywords/output/csv/2020-10-19_wikidata_item_labels.csv",
                    from_date=datetime(2019, 10, 1)):
    """Keep the comments of one partition that mention at least one keyword.

    Args:
        partition: File name of the partition under ``PARQUET_PATH``. When
            ``None``, the first entry returned by ``PARQUET_PATH.iterdir()``
            is used.
        keywords: CSV file passed to ``load_keywords``.
        from_date: Only rows whose ``CreatedAt`` is on or after this value are
            read.

    Rows with a match are written to ``OUTPUT_PATH / partition`` with an extra
    ``keyword_matches`` column holding the list of matched keywords. Nothing
    is written when no row matches.
    """
    if partition is None:
        partition_path = next(PARQUET_PATH.iterdir())
        partition = partition_path.stem
    else:
        partition_path = PARQUET_PATH / partition

    trie = load_keywords(keywords)

    pq_dataset = ds.dataset(partition_path)
    batches = pq_dataset.to_batches(filter=(ds.field("CreatedAt") >= from_date))

    # Collect the hits of every batch and write once at the end. Writing
    # inside the loop replaced the output file on each batch, so only the
    # last matching batch was kept.
    matched_frames = []
    for batch in batches:
        df = batch.to_pandas()
        if df.shape[0] == 0:
            continue
        matches = df.body.apply(lambda b: match_comment_kwlist(b, trie, min_length=5))
        has_match = matches.apply(lambda l: len(l) > 0)
        if has_match.any():
            # assign() returns a new frame, so no column is set on a slice.
            matched_frames.append(
                df.loc[has_match].assign(keyword_matches=matches[has_match])
            )

    if matched_frames:
        OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
        result = pd.concat(matched_frames, ignore_index=True)
        result.to_parquet(OUTPUT_PATH / f'{partition}', index=False, engine='pyarrow', flavor='spark')