-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathwords.py
More file actions
91 lines (78 loc) · 3.62 KB
/
words.py
File metadata and controls
91 lines (78 loc) · 3.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from natto import MeCab
def mecab_analysis(text):
    """Tokenize Japanese *text* with MeCab and return a list of surface forms.

    Only adjectives, nouns and adverbs are kept.  The tokenizer is configured
    with the mecab-ipadic-neologd dictionary (located via the external
    ``mecab-config --dicdir`` command) plus a local user dictionary
    ``username.dic``.

    :param text: raw Japanese text (any leading/trailing whitespace is stripped)
    :return: list of word surface strings, in document order
    """
    import os
    mecab_flags = [
        f'-d {os.popen("mecab-config --dicdir").read().strip()}/mecab-ipadic-neologd/',
        '-u username.dic',
    ]
    t = MeCab(' '.join(mecab_flags))
    # Keep a reference to the string handed to MeCab so it is not garbage
    # collected while MeCab still points into it:
    # https://shogo82148.github.io/blog/2012/12/15/mecab-python/
    enc_text = text.strip()
    t.parse('')  # workaround for UnicodeDecodeError: http://taka-say.hateblo.jp/entry/2015/06/24/183748
    # Part-of-speech classes to keep: adjectives, nouns, adverbs.
    keep_types = {"形容詞", "名詞", "副詞"}
    output = []
    for node in t.parse(enc_text, as_nodes=True):
        if node.surface != "":  # skip the BOS/EOS header and footer nodes
            word_type = node.feature.split(",")[0]
            if word_type in keep_types:
                output.append(node.surface)
    return output
def get_content_from_status(status):
    """Return the toot's spoiler text when it is non-empty, else its body.

    :param status: a Mastodon status dict with 'spoiler_text' and 'content' keys
    """
    spoiler = status['spoiler_text']
    if spoiler:
        return spoiler
    return status['content']
def convert_content(content):
    """Normalize a toot body for morphological analysis.

    Strips HTML tags and URLs, decodes HTML entities, unwraps the
    "sudden death" ASCII-art frame (keeping only its inner text),
    and finally removes spaces.

    :param content: raw HTML content of a status
    :return: cleaned plain-text string
    """
    import re, html
    without_tags = re.sub('<[^>]*>', '', content)
    without_urls = re.sub(r'https?://[^ ]+', "", without_tags)
    decoded = html.unescape(without_urls)
    # Keep only the message inside the ASCII-art frame (group 1).
    unframed = re.sub(r"_[人 ]+_\s*>([^<]+)<\s* ̄(Y\^)+Y ̄", r"\1", decoded)
    return re.sub(" ", "", unframed)
def wordlist_from_statuses(statuses):
    """Join every toot's cleaned text and tokenize the result with MeCab.

    :param statuses: iterable of Mastodon status dicts
    :return: list of word surface strings from all statuses combined
    """
    cleaned = [convert_content(get_content_from_status(status))
               for status in statuses]
    return mecab_analysis(' '.join(cleaned))
def get_wordcloud_from_wordlist(wordlist, background_image='background', slow_connection_mode=False):
    """Render a word cloud from *wordlist* and save it to /tmp/wordcloud.png.

    :param wordlist: list of words (joined with spaces and re-counted by
        WordCloud.process_text)
    :param background_image: path to the mask/background image file
    :param slow_connection_mode: when True, use a flat pastel palette and
        save a small 8-color paletted PNG instead of the full-size image
        colored from the background picture
    :return: tuple of (the WordCloud instance, the word-frequency dict)
    """
    import random
    from PIL import Image
    fpath = "/usr/share/fonts/opentype/noto/NotoSansCJK-Medium.ttc"
    # Stop words: common Japanese particles, auxiliaries and filler words
    # that would otherwise dominate the cloud.
    stop_words = [
        'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'ない',
        'くれる', 'やる', 'くださる', 'そう', 'せる', 'した', 'して',
        'て', 'に', 'を', 'は', 'の', 'が', 'と', 'た', 'し', 'で', 'も', 'な', 'い', 'か',
        'こと', 'これ', 'それ', 'ここ', 'もの',
        'ので', 'よう',
        'いい',
        '思う',
        '人', '気', '何',
        '私', '僕', '自分', 'やつ', 'さん', 'くん', 'ちゃん',
        '今日', '今', 'とき', 'まだ', 'もう', 'みたい',
    ]
    img_array = np.array(Image.open(background_image))
    pastel_colors = [f"hsl({hue}, 25%, 66%)" for hue in [0, 60, 120, 180]]
    def pastel_color_func(word, font_size, position, orientation, random_state=None,
                          **kwargs):
        # WordCloud passes its own random.Random as ``random_state``; prefer it
        # so colors are reproducible with a fixed seed, falling back to the
        # module RNG.  choice() also tracks the palette size automatically
        # (the old code hard-coded randint(0, 3)).
        rng = random_state if random_state is not None else random
        return rng.choice(pastel_colors)
    wordcloud = WordCloud(regexp=r"\w[\w']*",
                          normalize_plurals=False,
                          background_color="white",
                          font_path=fpath,
                          mask=img_array,
                          color_func=pastel_color_func if slow_connection_mode else ImageColorGenerator(img_array),
                          scale=1.5,
                          stopwords=set(stop_words),
                          )
    text = ' '.join(wordlist)
    words = wordcloud.process_text(text)
    wordcloud.generate_from_frequencies(words)
    if slow_connection_mode:
        # Shrink and quantize to an 8-color palette to keep the file small.
        (wordcloud.to_image()
            .resize((400, 400), resample=Image.BOX)
            .convert(mode="P", palette=Image.ADAPTIVE, colors=8)
            .save('/tmp/wordcloud.png'))
    else:
        wordcloud.to_file("/tmp/wordcloud.png")
    return wordcloud, words