|
99 | 99 | elif mode == 'distributed': |
100 | 100 | print("\n\nReading input from hdfs\n\n") |
101 | 101 | # Use spark session with schema instead of spark context and text file (this should speed up reading the file) |
102 | | - input_data = spark.read.schema(graph_file_schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, "paper") |
| 102 | + input_data = spark.read.schema(graph_file_schema).option('delimiter', '\t').csv(input_file).repartition("paper") |
103 | 103 | ##################################################################################################### |
104 | 104 | # Time initialization |
105 | 105 | initialisation_time = time.time() |
106 | 106 | # Print out info messages about the program's parameters |
107 | 107 | print ("Mode is: " + mode) |
108 | | -print ("Num Partitions: " + str(num_partitions)) |
| 108 | +#print ("Num Partitions: " + str(num_partitions)) |
109 | 109 | print ("Limit year: " + str(limit_year)) |
110 | 110 | print ("\n\n") |
111 | 111 | # Initialise SPARK Data |
|
116 | 116 | .select('paper', 'cited_papers', F.expr('size(cited_papers)-2').alias("cited_paper_size"), 'pub_year')\ |
117 | 117 | .select('paper', F.expr("slice(cited_papers, 1, cited_paper_size)").alias('cited_papers'), 'pub_year')\ |
118 | 118 | .select('paper', F.array_join('cited_papers', '|').alias('cited_papers'), 'pub_year')\ |
119 | | - .select('paper', F.split('cited_papers', ',').alias('cited_papers'), 'pub_year').repartition(num_partitions, 'pub_year').cache() |
| 119 | + .select('paper', F.split('cited_papers', ',').alias('cited_papers'), 'pub_year').repartition('pub_year').cache() |
120 | 120 |
|
121 | 121 | # Create a dataframe with nodes filtered based on whether they cite others or not. Here we keep those that make citations (i.e., remove dangling nodes) |
122 | 122 | print ("Planning removal of dangling nodes...") |
123 | 123 | outlinks_actual = outlinks.filter(outlinks['cited_papers'][0] != '0')\ |
124 | | - .select('paper', F.explode(F.col('cited_papers')).alias('cited_paper') , F.col('pub_year')).repartition(num_partitions, "paper").cache() |
| 124 | + .select('paper', F.explode(F.col('cited_papers')).alias('cited_paper') , F.col('pub_year')).repartition("paper").cache() |
125 | 125 |
|
126 | 126 | # If offset year is given, we need to perform some filtering of citations based on pub year. Proceed by normally calculating the 3-year based CC |
127 | 127 | if limit_year: |
128 | 128 | # We now need to filter out those records where citing year - cited year > limit_year |
129 | 129 | # a. join again with years, based on cited paper year - create a clone of the initial dataframe, because otherwise there will be an error due to similar column names |
130 | 130 | print ("Gathering years of cited papers...") |
131 | | - cited_paper_years = outlinks.select('paper', F.col('pub_year').alias('cited_paper_year')).withColumnRenamed('paper', 'cited_paper').repartition(num_partitions, 'cited_paper') |
| 131 | + cited_paper_years = outlinks.select('paper', F.col('pub_year').alias('cited_paper_year')).withColumnRenamed('paper', 'cited_paper').repartition('cited_paper') |
132 | 132 | # Since here outlinks_actual is joined on cited paper, we need to repartition it |
133 | | - valid_citations = outlinks_actual.repartition(num_partitions, 'cited_paper').join(cited_paper_years, outlinks_actual.cited_paper == cited_paper_years.cited_paper)\ |
| 133 | + valid_citations = outlinks_actual.repartition('cited_paper').join(cited_paper_years, outlinks_actual.cited_paper == cited_paper_years.cited_paper)\ |
134 | 134 | .select(outlinks_actual.paper, |
135 | 135 | cited_paper_years.cited_paper, |
136 | 136 | outlinks_actual.pub_year.alias('citing_paper_year'), |
137 | 137 | cited_paper_years.cited_paper_year)\ |
138 | | - .repartition(num_partitions, 'paper') |
| 138 | + .repartition('paper') |
139 | 139 |
|
140 | 140 | # b. Filter out those where citing paper year > cited paper year + limit_year
141 | 141 | print ("Filtering out citations based on pub year difference...") |
142 | | - valid_citations = valid_citations.filter(valid_citations['citing_paper_year']-valid_citations['cited_paper_year'] <= limit_year).repartition(num_partitions, 'paper').cache() |
| 142 | + valid_citations = valid_citations.filter(valid_citations['citing_paper_year']-valid_citations['cited_paper_year'] <= limit_year).repartition('paper').cache() |
143 | 143 | # Do nothing if no limit year was specified. For uniformity reasons we set the valid citations variable to point to outlinks_actual |
144 | 144 | else: |
145 | 145 | valid_citations = outlinks_actual |
146 | 146 |
|
147 | 147 | # Group by cited_paper and get counts |
148 | 148 | print("Preparing count of citations...") |
149 | | -valid_citations = valid_citations.repartition(num_partitions, 'cited_paper').groupBy('cited_paper').count().repartition(num_partitions, 'cited_paper') |
| 149 | +valid_citations = valid_citations.repartition('cited_paper').groupBy('cited_paper').count().repartition('cited_paper') |
150 | 150 |
|
151 | 151 | # Add papers which aren't cited |
152 | 152 | print("Planning addition of dangling nodes...") |
153 | 153 | # Join with papers that aren't cited |
154 | 154 | valid_citations = valid_citations.join(outlinks.select('paper'), outlinks.paper == valid_citations.cited_paper, 'right_outer')\ |
155 | 155 | .select('paper', 'count')\ |
156 | | - .fillna(0).repartition(num_partitions, 'paper').cache() |
| 156 | + .fillna(0).repartition('paper').cache() |
157 | 157 |
|
158 | 158 | print ("\n# ------------------------------------ #\n") |
159 | 159 | print("Finished planning calculations. Proceeding to calculation of scores and classes...\n") |
160 | 160 |
|
161 | 161 | # Time it |
162 | 162 | start_time = time.time() |
163 | | -max_score = valid_citations.select('count').repartition(num_partitions).distinct().agg({'count': 'max'}).collect()[0]['max(count)'] |
| 163 | +max_score = valid_citations.agg(F.max('count')).collect()[0]['max(count)'] |
164 | 164 | print ("Got max score:" + str(max_score) + " - Took {} seconds".format(time.time() - start_time) + " to get here from initial file read (this is the first transformation)") |
165 | 165 |
|
166 | 166 | # Time it |
|
186 | 186 | # Time it |
187 | 187 | start_time = time.time() |
188 | 188 | # Calculate a running count window of scores, in order to filter out papers w/ scores lower than that of the top 20% |
189 | | -distinct_scores = valid_citations.select(F.col('count').alias('cc')).repartition(num_partitions, 'cc').groupBy('cc').count()\ |
| 189 | +distinct_scores = valid_citations.select(F.col('count').alias('cc')).repartition('cc').groupBy('cc').count()\ |
190 | 190 | .withColumn('cumulative', F.sum('count').over(Window.orderBy(F.col('cc').desc()))) |
191 | 191 | distinct_scores_count = distinct_scores.count() |
192 | 192 | print ("Calculated distinct scores num (" + str(distinct_scores_count) + "), time: {} seconds ---".format(time.time() - start_time)) |
|
238 | 238 | .withColumn('normalized_' + column_name, F.lit(F.col(column_name)/float(max_score)))\ |
239 | 239 | .withColumn('three_point_class', F.lit('C')) |
240 | 240 | valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_1_score, F.lit('B')).otherwise(F.col('three_point_class')) ) |
241 | | -valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_001_score, F.lit('A')).otherwise(F.col('three_point_class')) ) |
| 241 | +valid_citations = valid_citations.withColumn('three_point_class', F.when(F.col(column_name) >= top_001_score, F.lit('A')).otherwise(F.col('three_point_class')) ) |
242 | 242 | valid_citations = valid_citations.select(F.regexp_replace('paper', 'comma_char', ',').alias('doi'), column_name, 'normalized_' + column_name, 'three_point_class') |
243 | 243 |
|
244 | 244 | # Add six point class to score dataframe |
|
0 commit comments