|
59 | 59 | # Set the mode by default as local. |
60 | 60 | # If data is read from hdfs we switch to cluster |
61 | 61 | mode = 'local' |
62 | | -if input_file.startswith('hdfs://'): |
| 62 | + |
| 63 | +# Detect execution mode from input path. |
| 64 | +# Any URI-scheme path (hdfs://, s3a://, gs://, etc.) is treated as distributed. |
| 65 | +if '://' in input_file: |
63 | 66 | mode = 'distributed' |
64 | 67 | ##################################################################################################### |
65 | 68 | # Create spark session & context - these are entry points to spark |
|
96 | 99 | elif mode == 'distributed': |
97 | 100 | print("\n\nReading input from hdfs\n\n") |
98 | 101 | # Use spark session with schema instead of spark context and text file (this should speed up reading the file) |
99 | | - input_data = spark.read.schema(graph_file_schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, "paper") |
| 102 | + input_data = spark.read.schema(graph_file_schema).option('delimiter', '\t').csv(input_file).repartition("paper") |
100 | 103 | ##################################################################################################### |
101 | 104 | # Time initialization |
102 | 105 | initialisation_time = time.time() |
103 | 106 | # Print out info messages about the program's parameters |
104 | 107 | print ("Mode is: " + mode) |
105 | | -print ("Num Partitions: " + str(num_partitions)) |
| 108 | +#print ("Num Partitions: " + str(num_partitions)) |
106 | 109 | print ("Limit year: " + str(limit_year)) |
107 | 110 | print ("\n\n") |
108 | 111 | # Initialise SPARK Data |
|
113 | 116 | .select('paper', 'cited_papers', F.expr('size(cited_papers)-2').alias("cited_paper_size"), 'pub_year')\ |
114 | 117 | .select('paper', F.expr("slice(cited_papers, 1, cited_paper_size)").alias('cited_papers'), 'pub_year')\ |
115 | 118 | .select('paper', F.array_join('cited_papers', '|').alias('cited_papers'), 'pub_year')\ |
116 | | - .select('paper', F.split('cited_papers', ',').alias('cited_papers'), 'pub_year').repartition(num_partitions, 'pub_year').cache() |
| 119 | + .select('paper', F.split('cited_papers', ',').alias('cited_papers'), 'pub_year').repartition('pub_year').cache() |
117 | 120 |
|
118 | 121 | # Create a dataframe with nodes filtered based on whether they cite others or not. Here we keep those that make citations (i.e., remove dangling nodes) |
119 | 122 | print ("Planning removal of dangling nodes...") |
120 | 123 | outlinks_actual = outlinks.filter(outlinks['cited_papers'][0] != '0')\ |
121 | | - .select('paper', F.explode(F.col('cited_papers')).alias('cited_paper') , F.col('pub_year')).repartition(num_partitions, "paper").cache() |
| 124 | + .select('paper', F.explode(F.col('cited_papers')).alias('cited_paper') , F.col('pub_year')).repartition("paper").cache() |
122 | 125 |
|
123 | 126 | # If offset year is given, we need to perform some filtering of citations based on pub year. Proceed by normally calculating the 3-year based CC |
124 | 127 | if limit_year: |
125 | 128 | # We now need to filter out those records where citing year - cited year > limit_year |
126 | 129 | # a. join again with years, based on cited paper year - create a clone of the initial dataframe, because otherwise there will be an error due to similar column names |
127 | 130 | print ("Gathering years of cited papers...") |
128 | | - cited_paper_years = outlinks.select('paper', F.col('pub_year').alias('cited_paper_year')).withColumnRenamed('paper', 'cited_paper').repartition(num_partitions, 'cited_paper') |
| 131 | + cited_paper_years = outlinks.select('paper', F.col('pub_year').alias('cited_paper_year')).withColumnRenamed('paper', 'cited_paper').repartition('cited_paper') |
129 | 132 | # Since here outlinks_actual is joined on cited paper, we need to repartition it |
130 | | - valid_citations = outlinks_actual.repartition(num_partitions, 'cited_paper').join(cited_paper_years, outlinks_actual.cited_paper == cited_paper_years.cited_paper)\ |
| 133 | + valid_citations = outlinks_actual.repartition('cited_paper').join(cited_paper_years, outlinks_actual.cited_paper == cited_paper_years.cited_paper)\ |
131 | 134 | .select(outlinks_actual.paper, |
132 | 135 | cited_paper_years.cited_paper, |
133 | 136 | outlinks_actual.pub_year.alias('citing_paper_year'), |
134 | 137 | cited_paper_years.cited_paper_year)\ |
135 | | - .repartition(num_partitions, 'paper') |
| 138 | + .repartition('paper') |
136 | 139 |
|
137 | 140 | # b. Filter out those where citing paper year > cited paper year + 3 |
138 | 141 | print ("Filtering out citations based on pub year difference...") |
139 | | - valid_citations = valid_citations.filter(valid_citations['citing_paper_year']-valid_citations['cited_paper_year'] <= limit_year).repartition(num_partitions, 'paper').cache() |
| 142 | + valid_citations = valid_citations.filter(valid_citations['citing_paper_year']-valid_citations['cited_paper_year'] <= limit_year).repartition('paper').cache() |
140 | 143 | # Do nothing if no limit year was specified. For uniformity reasons we set the valid citations variable to point to outlinks_actual |
141 | 144 | else: |
142 | 145 | valid_citations = outlinks_actual |
143 | 146 |
|
144 | 147 | # Group by cited_paper and get counts |
145 | 148 | print("Preparing count of citations...") |
146 | | -valid_citations = valid_citations.repartition(num_partitions, 'cited_paper').groupBy('cited_paper').count().repartition(num_partitions, 'cited_paper') |
| 149 | +valid_citations = valid_citations.repartition('cited_paper').groupBy('cited_paper').count().repartition('cited_paper') |
147 | 150 |
|
148 | 151 | # Add papers which aren't cited |
149 | 152 | print("Planning addition of dangling nodes...") |
150 | 153 | # Join with papers that aren't cited |
151 | 154 | valid_citations = valid_citations.join(outlinks.select('paper'), outlinks.paper == valid_citations.cited_paper, 'right_outer')\ |
152 | 155 | .select('paper', 'count')\ |
153 | | - .fillna(0).repartition(num_partitions, 'paper').cache() |
| 156 | + .fillna(0).repartition('paper').cache() |
154 | 157 |
|
155 | 158 | print ("\n# ------------------------------------ #\n") |
156 | 159 | print("Finished planning calculations. Proceeding to calculation of scores and classes...\n") |
157 | 160 |
|
158 | 161 | # Time it |
159 | 162 | start_time = time.time() |
160 | | -max_score = valid_citations.select('count').repartition(num_partitions).distinct().agg({'count': 'max'}).collect()[0]['max(count)'] |
| 163 | +max_score = valid_citations.agg(F.max('count')).collect()[0]['max(count)'] |
161 | 164 | print ("Got max score:" + str(max_score) + " - Took {} seconds".format(time.time() - start_time) + " to get here from initial file read (this is the first transformation)") |
162 | 165 |
|
163 | 166 | # Time it |
|
174 | 177 | # ------------------------------------------------------------------------------------------------------ # |
175 | 178 | # This code is included for small testing datasets. The percentages required may be < 1 for small datasets |
176 | 179 | top_001_offset = 1 if top_001_offset <= 1 else top_001_offset |
177 | | -top_01_offset = 1 if top_001_offset <= 1 else top_01_offset |
| 180 | +top_01_offset = 1 if top_01_offset <= 1 else top_01_offset |
178 | 181 | top_1_offset = 1 if top_1_offset <= 1 else top_1_offset |
179 | 182 | top_10_offset = 1 if top_10_offset <= 1 else top_10_offset |
180 | 183 | # top_20_offset = 1 if top_20_offset <= 1 else top_20_offset |
|
183 | 186 | # Time it |
184 | 187 | start_time = time.time() |
185 | 188 | # Calculate a running count window of scores, in order to filter out papers w/ scores lower than that of the top 20% |
186 | | -distinct_scores = valid_citations.select(F.col('count').alias('cc')).repartition(num_partitions, 'cc').groupBy('cc').count()\ |
| 189 | +distinct_scores = valid_citations.select(F.col('count').alias('cc')).repartition('cc').groupBy('cc').count()\ |
187 | 190 | .withColumn('cumulative', F.sum('count').over(Window.orderBy(F.col('cc').desc()))) |
188 | 191 | distinct_scores_count = distinct_scores.count() |
189 | 192 | print ("Calculated distinct scores num (" + str(distinct_scores_count) + "), time: {} seconds ---".format(time.time() - start_time)) |
|
235 | 238 | .withColumn('normalized_' + column_name, F.lit(F.col(column_name)/float(max_score)))\ |
236 | 239 | .withColumn('three_point_class', F.lit('C')) |
237 | 240 | valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_1_score, F.lit('B')).otherwise(F.col('three_point_class')) ) |
238 | | -valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_001_score, F.lit('A')).otherwise(F.col('three_point_class')) ) |
| 241 | +valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_001_score, F.lit('A')).otherwise(F.col('three_point_class')) ) |
239 | 242 | valid_citations = valid_citations.select(F.regexp_replace('paper', 'comma_char', ',').alias('doi'), column_name, 'normalized_' + column_name, 'three_point_class') |
240 | 243 |
|
241 | 244 | # Add six point class to score dataframe |
|
0 commit comments