Skip to content

Commit 8b58470

Browse files
authored
Code refactoring august 2022 (#22)
1 parent 55ae4d8 commit 8b58470

19 files changed

+1623
-747
lines changed

Dockerfile

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ FROM python:3.9.6-buster
22

33
WORKDIR /root/Horoscope
44

5+
# INSTALL SYSTEM DEPENDENCIES FOR DOCTR AND TESSERACT OCR
6+
RUN apt-get update \
7+
&& apt-get install -y libgl1-mesa-dev libsm6 libxext6 libxrender-dev \
8+
tesseract-ocr tesseract-ocr-fra
9+
510
# INSTALL PIPENV
611
RUN pip3 install pipenv
712

@@ -12,7 +17,8 @@ RUN pipenv install --dev --system --deploy
1217
#--system — Use the system pip command rather than the one from your virtualenv.
1318
#--deploy — Make sure the packages are properly locked in Pipfile.lock, and abort if the lock file is out-of-date.
1419

15-
RUN apt-get update &&\
16-
apt-get install -y tesseract-ocr tesseract-ocr-fra
20+
# FIX THIS
21+
# Unable to declare the DocTR extras in the Pipfile (pipenv raises an error), so the torch extra is installed with pip3 below instead
22+
RUN pip3 install python-doctr['torch']
1723

18-
CMD python3 -u horoscope_bot.py
24+
CMD python3 -u bot.py

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
build:
2-
docker build -t horoscope .
2+
docker-compose build
3+
up:
4+
docker-compose -p horoscope up -d
5+
down:
6+
docker-compose -p horoscope down -t 0
7+
38
run:
49
docker run -d -v $(PWD):/root/Horoscope --name horoscope horoscope
510
build_lock:
611
docker build -t lock -f Dockerfile.lock .
712
lock:
813
docker run -v $(PWD):/root/Horoscope --rm lock pipenv lock
14+
15+
run_selenium:
16+
docker run --rm -d -p 4444:4444 --name selenium-horoscope --shm-size=2g selenium/standalone-chrome

Pipfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ verify_ssl = true
44
name = "pypi"
55

66
[packages]
7-
discord = "*"
7+
discord.py = "*"
88
asyncio = "*"
99
aiohttp = "*"
1010
pytz = "*"
@@ -18,6 +18,8 @@ requests = "*"
1818
matplotlib = "*"
1919
Pillow = "*"
2020
nest-asyncio = "*"
21+
selenium = "*"
22+
python-doctr = "*"
2123

2224
[dev-packages]
2325

Pipfile.lock

Lines changed: 1292 additions & 569 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 35 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,27 @@
11
# coding: utf8
22
import discord
3+
import logging
34
import asyncio
45
import re
56
import os
6-
import pytz
77
import pickle
88
import datetime as dt
9-
import numpy as np
9+
10+
logging.basicConfig(
11+
level=logging.INFO,
12+
format='%(asctime)s | %(name)s | %(levelname)s | %(message)s'
13+
)
1014

1115
from PIL import Image
12-
from collections import Counter
1316
from typing import Optional, List
1417

1518
from my_constants import TOKEN, IMG_FOLDER, channel_horoscope
16-
from scraper import get_last_images, download_image
17-
from parse import parse_horoscope, reformat_horoscope
18-
from utils import convert_timedelta, md5
19+
from rtl2_horoscope.scraper.facebook import FacebookScraper
20+
from rtl2_horoscope.parse import parse_horoscope, reformat_horoscope
21+
from rtl2_horoscope.utils import now
1922

20-
import nest_asyncio
21-
nest_asyncio.apply()
23+
#import nest_asyncio
24+
#nest_asyncio.apply()
2225

2326

2427
manual = """
@@ -29,78 +32,29 @@
2932
```
3033
"""
3134

32-
tz_paris = pytz.timezone("Europe/Paris")
3335
TIMESTAMP_FORMAT = "%Y-%m-%d"
34-
USERNAME = "RTL2officiel"
35-
36-
# top,left,bottow,right
37-
true_width, true_height = 2362, 3431
38-
# True Horoscope has the following
39-
# color proportions
40-
true_occurences = Counter({1: 960179, 0: 750054, 2: 179367})
41-
# Tested on a header of size true_width * crop_height
42-
crop_height = 800
43-
true_proportions = np.array([true_occurences[0], true_occurences[1], true_occurences[2]])/(true_width * crop_height)
44-
rtl2_header = np.array([0, 0, true_width, crop_height])
45-
46-
kmeans = pickle.load(open("horoscope_kmeans.pickle", "rb"))
47-
48-
def now():
49-
return dt.datetime.now().astimezone(tz_paris)
50-
51-
def is_horoscope(filename, verbose=False):
52-
"""Check if it is a horoscope or not
53-
Step 1 : check the picture size
54-
Step 2 : use pretrained KMeans to compare color proporitons
55-
56-
Args:
57-
filename (str) : path to horoscope
58-
59-
Return:
60-
Bool : return True if it is an horoscope, False otherwise
61-
"""
62-
assert os.path.isfile(filename), "Invalid file name"
63-
64-
# Step 1
65-
photo = Image.open(filename)
66-
width, height = photo.size
67-
68-
print(f"Image size : {width}x{height}")
69-
if abs(width/true_width - height/true_height) > 0.05 :
70-
return False
71-
print(f"Ratio de l'image correct.")
72-
73-
# Step 2
74-
k = width/true_width
75-
pixels = np.array(photo.crop(tuple(k*rtl2_header))).reshape(-1, 3)
76-
occurences = Counter(kmeans.predict(pixels))
77-
proportions = np.array([occurences[0], occurences[1], occurences[2]])/(k*true_width * k*crop_height)
78-
if verbose:
79-
print(proportions, "Image proportions")
80-
print(true_proportions, "True proportions")
81-
print(np.sum(np.abs(true_proportions - proportions)), "Distance")
82-
return np.sum(np.abs(true_proportions - proportions)) < 0.03
83-
8436

8537

8638
class HoroscopeDiscordBot(discord.Client):
8739
def __init__(self, *args, **kwargs):
8840
super().__init__(*args, **kwargs)
41+
self.scraper = FacebookScraper()
8942

43+
async def setup_hook(self):
    """Schedule ``self.job()`` as a background task on the client's event loop.

    The task handle is stored on ``self.bg_task`` so that it stays referenced
    for the lifetime of the client.
    """
    # Keep a strong reference to the task: the event loop only holds weak
    # references to tasks, so an unreferenced task may be garbage-collected
    # before it completes.
    self.bg_task = self.loop.create_task(self.job())
9246

9347
async def on_ready(self):
9448
"""Initial check"""
9549
if not os.path.isdir(IMG_FOLDER):
96-
print(f"Création du dossier {IMG_FOLDER}")
50+
logging.info(f"Création du dossier {IMG_FOLDER}")
9751
os.mkdir(IMG_FOLDER)
9852

99-
print(f"[{now().ctime()}] - Bot ready :-)")
100-
print('Logged in as')
101-
print(self.user.name)
102-
print(self.user.id)
103-
print('------')
53+
logging.info("Bot ready :-)")
54+
logging.info('Logged in as')
55+
logging.info(self.user.name)
56+
logging.info(self.user.id)
57+
logging.info('------')
10458

10559
def is_for_bot(self, message) -> bool:
10660
"""Check if the message is for the bot.
@@ -147,9 +101,8 @@ async def job(self, fetch_interval=300, days=[0,1,2,3,4], hours=[9,10,11,12]):
147101
await asyncio.sleep(fetch_interval)
148102

149103
time_to_wait = self.get_time_to_wait(hours).total_seconds()
150-
time_to_wait_message = f"[{now().ctime()}] - " +\
151-
f"Reprise de l'activité dans {time_to_wait} secondes."
152-
print(time_to_wait_message)
104+
time_to_wait_message = f"Reprise de l'activité dans {time_to_wait} secondes."
105+
logging.info(time_to_wait_message)
153106

154107
await asyncio.sleep(time_to_wait)
155108

@@ -167,9 +120,8 @@ async def on_message(self, message):
167120
img_href = message.content.split(" ")[-1]
168121
if img_href.startswith("http") and await self.fetch_new_horoscope(img_href=img_href):
169122
time_to_wait = self.get_time_to_wait([10,11,12]).total_seconds()
170-
time_to_wait_message = f"[{now().ctime()}] - " +\
171-
f"Reprise de l'activité dans {time_to_wait} secondes."
172-
print(time_to_wait_message)
123+
time_to_wait_message = f"Reprise de l'activité dans {time_to_wait} secondes."
124+
logging.info(time_to_wait_message)
173125
await asyncio.sleep(time_to_wait)
174126

175127
if self.command(message, "last"):
@@ -182,62 +134,23 @@ async def on_message(self, message):
182134

183135
async def parse_and_send_horoscope(self, filename):
184136
"""Parse the image and send the image and the text found through OCR"""
185-
print("OCR : en cours.")
137+
logging.info("OCR : en cours.")
186138
horoscope_dict = parse_horoscope(filename, threads=1)
187139
horoscope_str = reformat_horoscope(horoscope_dict)
188-
print("OCR : terminé.")
140+
logging.info("OCR : terminé.")
189141
await self.get_channel(channel_horoscope).send(file=discord.File(filename))
190142
await self.get_channel(channel_horoscope).send(horoscope_str)
191143

192-
async def fetch_new_horoscope(self, img_href: Optional[str] = None):
193-
"""Get last image from RTL2 Twitter page, check if it's a new horoscope (using md5)
144+
async def fetch_new_horoscope(self, img_href: Optional[str] = None) -> bool:
145+
"""Get last image from RTL2 social media page, check if it's a new horoscope
194146
and send the file on Discord
195147
Args:
196148
img_href : if not None, download the image from <img_href> url
197149
"""
198-
199-
print(f"[{now().ctime()}] - Fetch Horoscope")
200-
if img_href:
201-
print(f"[{now().ctime()}] - Lien fourni par l'utilisateur : {img_href}.")
202-
img_hrefs = [img_href]
203-
else:
204-
print(f"[{now().ctime()}] - Récupération des dernières images depuis Twitter.")
205-
today = now().strftime("%Y-%m-%d")
206-
img_hrefs = get_last_images(username=USERNAME, since=today)
207-
208-
files = sorted(os.listdir(IMG_FOLDER + "/"), reverse=True)
209-
210-
if len(img_hrefs) > 0:
211-
print("Téléchargement des images...")
212-
else:
213-
print("Pas d'images tweetées aujourd'hui !")
214-
215-
for img_href in img_hrefs:
216-
filename = await download_image(img_href)
217-
218-
new_image = IMG_FOLDER + "/" + files[0]
219-
220-
if len(files) >= 1:
221-
old_image = IMG_FOLDER + "/" + files[1]
222-
else:
223-
old_image = ""
224-
225-
print(f"Test de l'image {img_href}")
226-
if is_horoscope(new_image, verbose=True):
227-
print("C'est un horoscope !")
228-
if md5(new_image) == md5(old_image):
229-
print("C'est l'horoscope d'hier")
230-
# Stop research
231-
return False
232-
else:
233-
print("C'est l'horoscope du jour")
234-
await self.parse_and_send_horoscope(new_image)
235-
# Stop research
236-
return True
237-
else:
238-
print("Ce n'est pas un nouveau horoscope")
239-
# Continue research
240-
150+
horoscope = await self.scraper.fetch_new_horoscope(img_href)
151+
if horoscope:
152+
await self.parse_and_send_horoscope(horoscope)
153+
return True
241154
return False
242155

243156
def get_time_to_wait(self, hours):
@@ -258,5 +171,7 @@ def get_time_to_wait(self, hours):
258171
return next_day-today
259172

260173
if __name__ == "__main__":
261-
client = HoroscopeDiscordBot()
174+
intents = discord.Intents.default()
175+
intents.message_content = True
176+
client = HoroscopeDiscordBot(intents=intents)
262177
client.run(TOKEN)

docker-compose.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
version: '3.7'
2+
3+
services:
4+
5+
selenium-horoscope:
6+
image: selenium/standalone-chrome
7+
container_name: selenium-horoscope
8+
shm_size: "2gb"
9+
10+
bot:
11+
build:
12+
context: .
13+
dockerfile: Dockerfile
14+
image: horoscope-bot
15+
volumes:
16+
- ./:/root/Horoscope/
17+
container_name: horoscope
18+
depends_on:
19+
- "selenium-horoscope"
20+

parse.py renamed to rtl2_horoscope/parse.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from concurrent.futures import ThreadPoolExecutor
33
from itertools import repeat
44
import re
5+
from pathlib import Path
56

67
import pandas as pd
78
pd.options.mode.chained_assignment = None
@@ -56,6 +57,7 @@ def read_crop(img, crop_region=None, pb=None):
5657
Args:
5758
img (PIL.Image): Image to read from.
5859
crop_region (tuple of ints): Coordinates of the rectangle containing the text to read.
60+
pb (tqdm progress bar, optional): Progress bar supplied by the caller. Defaults to None.
5961
6062
Returns:
6163
str: The text as read by Tesseract.
@@ -232,24 +234,24 @@ def parse_horoscope(img, threads=12, verbose=True):
232234
'poisson': ('bronze',
233235
'Votre corps réclame une pause, ne tirez \\pas trop sur la corde.')}
234236
"""
235-
if isinstance(img, str):
237+
if isinstance(img, str) or isinstance(img, Path):
236238
img = Image.open(img)
237239
img.load()
238-
240+
239241
# Rescale regions if necessary
240242
factor = img.width/true_width
241243
print("scale factor:", factor)
242244
if factor != 1:
243245
scaled_regions = scale_regions(factor, regions)
244246
else:
245247
scaled_regions = regions
246-
248+
247249
# Read and clean up texts
248250
texts = read_texts(img, threads=threads, regions=scaled_regions, verbose=verbose)
249251
texts = {sign: clean_up_text(text) for sign, text in texts.items()}
250-
252+
251253
stars = find_star_colors(img, regions=scaled_regions, robust=True)
252-
254+
253255
keys = texts.keys()
254256
values = zip(stars.values(), texts.values())
255257
out = dict(zip(keys, values))

rtl2_horoscope/scraper/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .scraper import Scraper

rtl2_horoscope/scraper/facebook.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import logging
2+
3+
from selenium import webdriver
4+
from bs4 import BeautifulSoup
5+
from typing import List
6+
7+
from rtl2_horoscope.scraper import Scraper
8+
9+
ALBUM_URL = "https://www.facebook.com/pg/rtl2/photos/?tab=album&album_id=248389291078&ref=page_internal"
10+
WEBDRIVER_URL = 'http://selenium-horoscope:4444/wd/hub'
11+
12+
class FacebookScraper(Scraper):
    """Scraper that collects image URLs from a Facebook photo album.

    The album page is loaded through a remote Selenium WebDriver and the
    resulting HTML is parsed with BeautifulSoup.
    """

    def __init__(self, album_url: str = ALBUM_URL, webdriver_url: str = WEBDRIVER_URL):
        """Store the album to scrape and the WebDriver endpoint to use.

        Args:
            album_url (str): URL of the Facebook album page to load.
            webdriver_url (str): URL of the remote Selenium WebDriver hub.
        """
        super().__init__(social_media="facebook")
        self.album_url = album_url
        self.webdriver_url = webdriver_url

    def get_last_images(self, **kwargs) -> List[str]:
        """Return the ``src`` of every ``<img>`` nested in a link on the album page.

        Args:
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List[str]: Image URLs in document order. ``<img>`` tags without a
            ``src`` attribute are skipped.
        """
        logging.info("Initialize Webdriver")
        driver = webdriver.Remote(
            self.webdriver_url,
            options=webdriver.ChromeOptions()
        )
        try:
            driver.set_window_size(1280, 1024)
            logging.info(f"Get {self.album_url} ...")
            driver.get(self.album_url)
            logging.info("Done")
            page_source = driver.page_source
        finally:
            # quit() ends the remote session even when loading the page fails;
            # close() only closes the current window and could leave the
            # session alive on the Selenium server.
            driver.quit()

        soup = BeautifulSoup(page_source, features="html5lib")

        # Only images wrapped in a link are collected; images with no "src"
        # are ignored instead of raising KeyError.
        return [
            img["src"]
            for a in soup.find_all("a")
            for img in a.find_all("img")
            if img.get("src")
        ]
42+

0 commit comments

Comments
 (0)