|
59 | 59 | # Set the mode by default as local. |
60 | 60 | # If data is read from hdfs we switch to cluster |
61 | 61 | mode = 'local' |
62 | | -if input_file.startswith('hdfs://'): |
| 62 | + |
| 63 | +# Detect execution mode from input path. |
| 64 | +# Any URI-scheme path (hdfs://, s3a://, gs://, etc.) is treated as distributed. |
| 65 | +if '://' in input_file: |
63 | 66 | mode = 'distributed' |
64 | 67 | ##################################################################################################### |
65 | 68 | # Create spark session & context - these are entry points to spark |
|
96 | 99 | elif mode == 'distributed': |
97 | 100 | print("\n\nReading input from hdfs\n\n") |
98 | 101 | # Use spark session with schema instead of spark context and text file (this should speed up reading the file) |
99 | | - input_data = spark.read.schema(graph_file_schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, "paper") |
| 102 | + input_data = spark.read.schema(graph_file_schema).option('delimiter', '\t').csv(input_file).repartition("paper") |
100 | 103 | ##################################################################################################### |
101 | 104 | # Time initialization |
102 | 105 | initialisation_time = time.time() |
103 | 106 | # Print out info messages about the program's parameters |
104 | 107 | print ("Mode is: " + mode) |
105 | | -print ("Num Partitions: " + str(num_partitions)) |
| 108 | +#print ("Num Partitions: " + str(num_partitions)) |
106 | 109 | print ("Limit year: " + str(limit_year)) |
107 | 110 | print ("\n\n") |
108 | 111 | # Initialise SPARK Data |
|
113 | 116 | .select('paper', 'cited_papers', F.expr('size(cited_papers)-2').alias("cited_paper_size"), 'pub_year')\ |
114 | 117 | .select('paper', F.expr("slice(cited_papers, 1, cited_paper_size)").alias('cited_papers'), 'pub_year')\ |
115 | 118 | .select('paper', F.array_join('cited_papers', '|').alias('cited_papers'), 'pub_year')\ |
116 | | - .select('paper', F.split('cited_papers', ',').alias('cited_papers'), 'pub_year').repartition(num_partitions, 'pub_year').cache() |
| 119 | + .select('paper', F.split('cited_papers', ',').alias('cited_papers'), 'pub_year').repartition('pub_year').cache() |
117 | 120 |
|
118 | 121 | # Create a dataframe with nodes filtered based on whether they cite others or not. Here we keep those that make citations (i.e., remove dangling nodes) |
119 | 122 | print ("Planning removal of dangling nodes...") |
120 | 123 | outlinks_actual = outlinks.filter(outlinks['cited_papers'][0] != '0')\ |
121 | | - .select('paper', F.explode(F.col('cited_papers')).alias('cited_paper') , F.col('pub_year')).repartition(num_partitions, "paper").cache() |
| 124 | + .select('paper', F.explode(F.col('cited_papers')).alias('cited_paper') , F.col('pub_year')).repartition("paper").cache() |
122 | 125 |
|
123 | 126 | # If offset year is given, we need to perform some filtering of citations based on pub year. Proceed by normally calculating the 3-year based CC |
124 | 127 | if limit_year: |
125 | 128 | # We now need to filter out those records where citing year - cited year > limit_year |
126 | 129 | # a. join again with years, based on cited paper year - create a clone of the initial dataframe, because otherwise there will be an error due to similar column names |
127 | 130 | print ("Gathering years of cited papers...") |
128 | | - cited_paper_years = outlinks.select('paper', F.col('pub_year').alias('cited_paper_year')).withColumnRenamed('paper', 'cited_paper').repartition(num_partitions, 'cited_paper') |
| 131 | + cited_paper_years = outlinks.select('paper', F.col('pub_year').alias('cited_paper_year')).withColumnRenamed('paper', 'cited_paper').repartition('cited_paper') |
129 | 132 | # Since here outlinks_actual is joined on cited paper, we need to repartition it |
130 | | - valid_citations = outlinks_actual.repartition(num_partitions, 'cited_paper').join(cited_paper_years, outlinks_actual.cited_paper == cited_paper_years.cited_paper)\ |
| 133 | + valid_citations = outlinks_actual.repartition('cited_paper').join(cited_paper_years, outlinks_actual.cited_paper == cited_paper_years.cited_paper)\ |
131 | 134 | .select(outlinks_actual.paper, |
132 | 135 | cited_paper_years.cited_paper, |
133 | 136 | outlinks_actual.pub_year.alias('citing_paper_year'), |
134 | 137 | cited_paper_years.cited_paper_year)\ |
135 | | - .repartition(num_partitions, 'paper') |
| 138 | + .repartition('paper') |
136 | 139 |
|
137 | 140 | # b. Filter out those where citing paper year > cited paper year + 3 |
138 | 141 | print ("Filtering out citations based on pub year difference...") |
139 | | - valid_citations = valid_citations.filter(valid_citations['citing_paper_year']-valid_citations['cited_paper_year'] <= limit_year).repartition(num_partitions, 'paper').cache() |
| 142 | + valid_citations = valid_citations.filter(valid_citations['citing_paper_year']-valid_citations['cited_paper_year'] <= limit_year).repartition('paper').cache() |
140 | 143 | # Do nothing if no limit year was specified. For uniformity reasons we set the valid citations variable to point to outlinks_actual |
141 | 144 | else: |
142 | 145 | valid_citations = outlinks_actual |
143 | 146 |
|
144 | 147 | # Group by cited_paper and get counts |
145 | 148 | print("Preparing count of citations...") |
146 | | -valid_citations = valid_citations.repartition(num_partitions, 'cited_paper').groupBy('cited_paper').count().repartition(num_partitions, 'cited_paper') |
| 149 | +valid_citations = valid_citations.repartition('cited_paper').groupBy('cited_paper').count().repartition('cited_paper') |
147 | 150 |
|
148 | 151 | # Add papers which aren't cited |
149 | 152 | print("Planning addition of dangling nodes...") |
150 | 153 | # Join with papers that aren't cited |
151 | 154 | valid_citations = valid_citations.join(outlinks.select('paper'), outlinks.paper == valid_citations.cited_paper, 'right_outer')\ |
152 | 155 | .select('paper', 'count')\ |
153 | | - .fillna(0).repartition(num_partitions, 'paper').cache() |
| 156 | + .fillna(0).repartition('paper').cache() |
154 | 157 |
|
155 | 158 | print ("\n# ------------------------------------ #\n") |
156 | 159 | print("Finished planning calculations. Proceeding to calculation of scores and classes...\n") |
157 | 160 |
|
158 | 161 | # Time it |
159 | 162 | start_time = time.time() |
160 | | -max_score = valid_citations.select('count').repartition(num_partitions).distinct().agg({'count': 'max'}).collect()[0]['max(count)'] |
| 163 | +max_score = valid_citations.agg(F.max('count')).collect()[0]['max(count)'] |
161 | 164 | print ("Got max score:" + str(max_score) + " - Took {} seconds".format(time.time() - start_time) + " to get here from initial file read (this is the first transformation)") |
162 | 165 |
|
163 | 166 | # Time it |
|
174 | 177 | # ------------------------------------------------------------------------------------------------------ # |
175 | 178 | # This code is included for small testing datasets. The percentages required may be < 1 for small datasets |
176 | 179 | top_001_offset = 1 if top_001_offset <= 1 else top_001_offset |
177 | | -top_01_offset = 1 if top_001_offset <= 1 else top_01_offset |
| 180 | +top_01_offset = 1 if top_01_offset <= 1 else top_01_offset |
178 | 181 | top_1_offset = 1 if top_1_offset <= 1 else top_1_offset |
179 | 182 | top_10_offset = 1 if top_10_offset <= 1 else top_10_offset |
180 | 183 | # top_20_offset = 1 if top_20_offset <= 1 else top_20_offset |
|
183 | 186 | # Time it |
184 | 187 | start_time = time.time() |
185 | 188 | # Calculate a running count window of scores, in order to filter out papers w/ scores lower than that of the top 20% |
186 | | -distinct_scores = valid_citations.select(F.col('count').alias('cc')).repartition(num_partitions, 'cc').groupBy('cc').count()\ |
| 189 | +distinct_scores = valid_citations.select(F.col('count').alias('cc')).repartition('cc').groupBy('cc').count()\ |
187 | 190 | .withColumn('cumulative', F.sum('count').over(Window.orderBy(F.col('cc').desc()))) |
188 | 191 | distinct_scores_count = distinct_scores.count() |
189 | 192 | print ("Calculated distinct scores num (" + str(distinct_scores_count) + "), time: {} seconds ---".format(time.time() - start_time)) |
|
235 | 238 | .withColumn('normalized_' + column_name, F.lit(F.col(column_name)/float(max_score)))\ |
236 | 239 | .withColumn('three_point_class', F.lit('C')) |
237 | 240 | valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_1_score, F.lit('B')).otherwise(F.col('three_point_class')) ) |
238 | | -valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_001_score, F.lit('A')).otherwise(F.col('three_point_class')) ) |
| 241 | +valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_001_score, F.lit('A')).otherwise(F.col('three_point_class')) ) |
239 | 242 | valid_citations = valid_citations.select(F.regexp_replace('paper', 'comma_char', ',').alias('doi'), column_name, 'normalized_' + column_name, 'three_point_class') |
240 | 243 |
|
241 | 244 | # Add six point class to score dataframe |
|
0 commit comments