# histogram.py (forked from commoncrawl/cc-crawl-statistics)
"""Plot histograms of crawl statistics: pages per URL, URLs per
host / domain / TLD, and cumulative URL coverage per domain."""

import os.path
import sys

from collections import defaultdict

import pandas

from crawlstats import CST
from rpy2.robjects.lib import ggplot2
from rpy2.robjects import pandas2ri

from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS

pandas2ri.activate()


class CrawlHistogram(CrawlPlot):

    PSEUDO_LOG_BINS = [0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
                       10000, 20000, 50000, 100000, 200000, 500000, 1000000,
                       2*10**6, 5*10**6, 10**7, 2*10**7, 5*10**7, 10**8,
                       2*10**8, 5*10**8, 10**9]
    # alternative (would require `import numpy`):
    # PSEUDO_LOG_BINS = numpy.logspace(0.0, 6.0, 19)
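
    # Note: PSEUDO_LOG_BINS is not referenced below. A sketch of how the
    # bins could bucket raw counts (pandas.cut is an assumption about the
    # intended use, not part of the original pipeline):
    #   pandas.cut([3, 42, 1500], CrawlHistogram.PSEUDO_LOG_BINS)
    #   # -> (2, 5], (20, 50], (1000, 2000]
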
    def __init__(self):
        self.histogr = defaultdict(dict)
        self.N = 0

    def add(self, key, frequency):
        # keep only histogram records; SURT domain histograms are skipped
        cst = CST[key[0]]
        if cst != CST.histogram:
            return
        item_type = key[1]
        if item_type == 'surt_domain':
            return
        crawl = key[2]
        type_counted = key[3]
        count = key[4]
        self.histogr['crawl'][self.N] = crawl
        self.histogr['type'][self.N] = item_type
        self.histogr['type_counted'][self.N] = type_counted
        self.histogr['count'][self.N] = count
        self.histogr['frequency'][self.N] = frequency
        self.N += 1
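
    # Shape of the input add() expects, inferred from the tuple unpacking
    # above (the concrete values are illustrative, not real data):
    #   key = ('histogram', 'url', 'CC-MAIN-2017-17', 'page', 2)
    #   frequency = 12345   # e.g., 12345 URLs were fetched exactly twice
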
    def transform_data(self):
        self.histogr = pandas.DataFrame(self.histogr)

    def save_data(self):
        self.histogr.to_csv('data/crawlhistogr.csv')

    def plot_dupl_url(self):
        # -- pages per URL (URL-level duplicates)
        row_filter = ['url']
        data = self.histogr
        data = data[data['type'].isin(row_filter)]
        title = 'Pages per URL (URL-level duplicates)'
        p = ggplot2.ggplot(data) \
            + ggplot2.aes_string(x='count', y='frequency') \
            + ggplot2.geom_jitter() \
            + ggplot2.facet_wrap('crawl', ncol=5) \
            + ggplot2.labs(title=title, x='(duplicate) pages per URL',
                           y='log(frequency)') \
            + ggplot2.scale_y_log10()
        #   + ggplot2.scale_x_log10()  # could use a log-log scale
        img_path = os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png')
        p.save(img_path)
        # data.to_csv(img_path + '.csv')
        return p
    def plot_host_domain_tld(self):
        # -- URLs per host / domain / TLD
        data = self.histogr
        data = data[data['type'].isin(['host', 'domain', 'tld'])]
        data = data[data['type_counted'].isin(['url'])]
        img_path = os.path.join(PLOTDIR,
                                'crawler/histogr_host_domain_tld.png')
        # data.to_csv(img_path + '.csv')
        title = 'URLs per Host / Domain / TLD'
        p = ggplot2.ggplot(data) \
            + ggplot2.aes_string(x='count', weight='frequency', color='type') \
            + ggplot2.geom_freqpoly(bins=20) \
            + ggplot2.facet_wrap('crawl', ncol=4) \
            + ggplot2.labs(title='', x=title,
                           y='Frequency') \
            + ggplot2.scale_y_log10() \
            + ggplot2.scale_x_log10()
        p.save(img_path)
        return p
    def plot_domain_cumul(self, crawl):
        # -- coverage (cumulative URLs) per domain
        data = self.histogr
        data = data[data['type'].isin(['domain'])]
        data = data[data['crawl'] == crawl]
        # copy() so that adding columns below does not trigger pandas'
        # SettingWithCopyWarning on a filtered view
        data = data[data['type_counted'].isin(['url'])].copy()
        data['urls'] = data['count']*data['frequency']
        data = data[['urls', 'count', 'frequency']]
        data = data.sort_values(['count'], ascending=False)
        data['cum_domains'] = data['frequency'].cumsum()
        data['cum_urls'] = data['urls'].cumsum()
        data_perc = data.apply(lambda x: round(100.0*x/float(x.sum()), 1))
        data['%domains'] = data_perc['frequency']
        data['%urls'] = data_perc['urls']
        data['%cum_domains'] = data['cum_domains'].apply(
            lambda x: round(100.0*x/float(data['frequency'].sum()), 1))
        data['%cum_urls'] = data['cum_urls'].apply(
            lambda x: round(100.0*x/float(data['urls'].sum()), 1))
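
        # Worked micro-example of the columns above (numbers illustrative):
        # two bins with count=[100, 1] URLs per domain and frequency=[2, 8]
        # domains per bin give
        #   urls         = [200, 8]
        #   cum_domains  = [2, 10]          cum_urls  = [200, 208]
        #   %cum_domains = [20.0, 100.0]    %cum_urls = [96.2, 100.0]
        # i.e. 20% of the domains account for ~96% of the URLs.
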
        with pandas.option_context('display.max_rows', None,
                                   'display.max_columns', None,
                                   'display.width', 200):
            print(data)
        img_path = os.path.join(PLOTDIR,
                                'crawler/histogr_domain_cumul.png')
        # data.to_csv(img_path + '.csv')
        title = 'Cumulative URLs for Top Domains'
        p = ggplot2.ggplot(data) \
            + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
            + ggplot2.geom_line() + ggplot2.geom_point() \
            + GGPLOT2_THEME \
            + ggplot2.theme(**GGPLOT2_THEME_KWARGS) \
            + ggplot2.labs(title=title, x='domains cumulative',
                           y='URLs cumulative') \
            + ggplot2.scale_y_log10() \
            + ggplot2.scale_x_log10()
        p.save(img_path)
        return p


if __name__ == '__main__':
    # the last command-line argument names the crawl plotted by
    # plot_domain_cumul()
    latest_crawl = sys.argv[-1]
    plot = CrawlHistogram()
    plot.read_data(sys.stdin)
    plot.transform_data()
    plot.save_data()
    plot.plot_dupl_url()
    plot.plot_host_domain_tld()
    plot.plot_domain_cumul(latest_crawl)
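
# Example invocation (a sketch: it assumes aggregated stats records are
# piped on stdin, as read_data(sys.stdin) above implies; `stats.txt` and
# the crawl id are hypothetical):
#   python3 histogram.py CC-MAIN-2017-17 < stats.txt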