Skip to content

Commit d42568e

Browse files
committed
Fixed sorting parameter: renamed `reverse=True` to `descending=True` in polars `.sort()` calls
1 parent ee4c4e5 commit d42568e

File tree

2 files changed

+21
-16
lines changed

2 files changed

+21
-16
lines changed

main.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
Github: https://github.com/rNLKJA/2023-S1-COMP90024-A1/
1010
1111
"""
12+
from scripts.mpi import gather_task_tdf, get_task_ranks
13+
from scripts.utils import *
14+
from scripts.twitter_processor import *
15+
from scripts.sal_processor import process_salV1
16+
from scripts.logger import twitter_logger as logger
17+
from scripts.arg_parser import parser
1218
import sys
1319
import time
1420
import os
@@ -18,12 +24,6 @@
1824

1925
sys.path.append("./scripts")
2026

21-
from scripts.arg_parser import parser
22-
from scripts.logger import twitter_logger as logger
23-
from scripts.sal_processor import process_salV1
24-
from scripts.twitter_processor import *
25-
from scripts.utils import *
26-
from scripts.mpi import gather_task_tdf, get_task_ranks
2727

2828
os.environ["NUMEXPR_MAX_THREADS"] = "32"
2929
PATH = Path()
@@ -54,17 +54,17 @@
5454
twitter_file, chunk_start[rank], chunk_end[rank], sal_dict
5555
)
5656

57-
logger.info(f"Rank {rank}: File Read Completed, cost: {time.time()- start_time}")
57+
logger.info(
58+
f"Rank {rank}: File Read Completed, cost: {time.time()- start_time}")
5859

5960
# process twitter data based on three task requirements
6061
t1_tdf = count_number_of_tweets_by_author(tdf)
6162
t2_tdf = count_number_of_tweets_by_gcc(tdf)
6263
t3_tdf = count_author_tweets_from_most_different_gcc(tdf)
63-
6464
# =================================== TASK 1 ===================================
6565
t1_tdfs = gather_task_tdf(rank, task1_rank, size, t1_tdf, comm)
66-
67-
if rank == task1_rank:
66+
67+
if rank == task1_rank:
6868
return_twitter_counts_by_author_id(t1_tdfs, path=PATH)
6969
# =================================== TASK 2 ===================================
7070
t2_tdfs = gather_task_tdf(rank, task2_rank, size, t2_tdf, comm)

scripts/twitter_processor.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,8 @@ def generate_polars_dataframe(
291291
)
292292

293293
tweet_df1 = tweet_df.with_columns(
294-
pl.col("location").apply(lambda x: normalise_location(x), skip_nulls=True)
294+
pl.col("location").apply(
295+
lambda x: normalise_location(x), skip_nulls=True)
295296
)
296297
tweet_df1 = tweet_df1.join(sal_df, on="location", how="left")
297298

@@ -362,7 +363,7 @@ def count_number_of_tweets_by_author(tdf: pl.DataFrame) -> pl.DataFrame:
362363
tdf.select("author_id", "tweet_id")
363364
.groupby("author_id")
364365
.agg(pl.count("tweet_id").alias("tweet_count"))
365-
.sort("tweet_count", reverse=True)
366+
.sort("tweet_count", descending=True)
366367
)
367368

368369
return author_tweet_count
@@ -466,7 +467,8 @@ def generate_task_3_result(tdf: pl.DataFrame, save: bool, path: Path) -> pl.Data
466467
)
467468

468469
tdf1 = tdf1.with_columns(
469-
pl.col("gcc_count").rank(method="ordinal", descending=True).alias("rank")
470+
pl.col("gcc_count").rank(method="ordinal",
471+
descending=True).alias("rank")
470472
)
471473
tdf1 = tdf1.filter(pl.col("rank") < 11)
472474

@@ -498,10 +500,12 @@ def generate_task_3_result(tdf: pl.DataFrame, save: bool, path: Path) -> pl.Data
498500
).alias("gtc")
499501
]
500502
).select("rank", "author_id", "gtc")
501-
tdf4.columns = ["Rank", "Author Id", "Number of Unique City Locations and #Tweets"]
503+
tdf4.columns = ["Rank", "Author Id",
504+
"Number of Unique City Locations and #Tweets"]
502505

503506
if save:
504-
tdf4.sort("Rank", descending=False).write_csv(path / "data/result/task3.csv")
507+
tdf4.sort("Rank", descending=False).write_csv(
508+
path / "data/result/task3.csv")
505509
return
506510
return tdf4
507511

@@ -537,7 +541,8 @@ def concate_count_dict_with_rank_df(count_dict: dict) -> pl.DataFrame:
537541
"""
538542
strings = []
539543
for key in count_dict.keys():
540-
strings.append(", ".join([f"#{v}{k[1:]}" for k, v in count_dict[key].items()]))
544+
strings.append(
545+
", ".join([f"#{v}{k[1:]}" for k, v in count_dict[key].items()]))
541546

542547
return pl.DataFrame({"author_id": count_dict.keys(), "nugt": strings})
543548

0 commit comments

Comments (0)