1+ <?php
2+ namespace App ;
3+
4+ use Exception ;
5+ use simple_html_dom ;
6+ use simple_html_dom_node ;
7+
/**
 * Downloads one forum thread page and stores every post found on it as a
 * plain-text file in the posts folder.
 *
 * Configuration is read from environment variables:
 *  - FORUM_URL           base URL of the forum (required)
 *  - THEME_URL           URL of the thread page to download (required)
 *  - MESSAGE_POST_CLASS  CSS class of a post container (required)
 *  - MESSAGE_DATE_CLASS  CSS class of the element wrapping the post date
 *  - AUTH_URL            login URL; when unset it is discovered from the
 *                        form with id AUTH_FORM_ID on the forum front page
 *  - AUTH_USERNAME / AUTH_PASSWORD  credentials sent to the login URL
 *  - USER_AGENT          overrides the default User-Agent header
 */
class Parser
{
    /** @var string|false Forum base URL; false when FORUM_URL is not set. */
    protected $forumURL;

    /**
     * @var resource|null Retained for backward compatibility with subclasses.
     *                    Requests now use short-lived local handles that are
     *                    always closed, so this is no longer assigned.
     */
    protected $curl;

    /** @var string User-Agent used when USER_AGENT is not set. */
    protected $defaultUserAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' .
        '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36';

    /** @var string User-Agent header sent with every request. */
    protected $userAgent;

    /** @var Logger */
    protected $logger;

    /** @var simple_html_dom Shared parser instance, reloaded for each page. */
    protected $htmlDom;

    public function __construct()
    {
        $this->forumURL = getenv('FORUM_URL');
        $this->userAgent = getenv('USER_AGENT') ?: $this->defaultUserAgent;

        $this->logger = new Logger();

        $this->htmlDom = new simple_html_dom();
    }

    /**
     * Logs in (best effort), downloads the thread page and saves its posts.
     *
     * @return void
     *
     * @throws Exception When required configuration is missing or the thread
     *                   page cannot be downloaded.
     */
    public function handle()
    {
        // Validate all configuration first so a misconfigured run fails
        // before any network traffic is generated.
        if (!$this->forumURL) {
            throw new Exception('Forum URL is empty');
        }

        $themeURL = getenv('THEME_URL');
        if (!$themeURL) {
            throw new Exception('Theme URL is empty');
        }

        $postClass = getenv('MESSAGE_POST_CLASS');
        if (!$postClass) {
            throw new Exception('Message post class is empty');
        }

        $cookies = $this->authenticate();

        $options = [CURLOPT_HTTPGET => true];
        if ($cookies !== '') {
            $options[CURLOPT_COOKIE] = $cookies;
        }

        $body = $this->request($themeURL, $options);
        if ($body === false) {
            throw new Exception('Unable to load theme page');
        }

        $html = $this->htmlDom->load($body);

        $messages = $html->find('.' . $postClass);

        // A page without <title> must not abort the run.
        $titleNode = $html->find('title', 0);
        $theme = $titleNode ? $titleNode->text() : '';

        if (!$messages) {
            $this->logger->addWarning('Messages are empty');

            return;
        }

        /** @var simple_html_dom_node $message */
        foreach ($messages as $message) {
            $this->saveMessage($message, $theme);
        }
    }

    /**
     * Resolves the URL the login form has to be posted to.
     *
     * AUTH_URL wins when set; otherwise the forum front page is downloaded
     * and the `action` of the form with id AUTH_FORM_ID is used.
     *
     * @return string Login URL, or '' when it cannot be determined.
     */
    protected function authAction(): string
    {
        $authURL = getenv('AUTH_URL');
        if ($authURL) {
            return $authURL;
        }

        $formID = getenv('AUTH_FORM_ID');
        if (!$formID) {
            $this->logger->addError('Auth form ID is empty');

            return '';
        }

        $body = $this->request((string) $this->forumURL);
        if ($body === false) {
            return '';
        }

        /** @var simple_html_dom_node|null $form */
        $form = $this->htmlDom->load($body)->find('#' . $formID, 0);

        if (!$form) {
            $this->logger->addError('Form is empty on the page');

            return '';
        }

        $action = isset($form->attr['action']) ? trim($form->attr['action']) : '';
        if ($action === '') {
            $this->logger->addError('Form has no action attribute');

            return '';
        }

        return $this->resolveUrl($action);
    }

    /**
     * @return string Directory the post files are written to.
     */
    protected function postsFolder(): string
    {
        return __DIR__ . '/../posts';
    }

    /**
     * Writes one post (title, author, date, text) to "<theme>-<date>.txt".
     *
     * Elements whose id does not start with "post" are skipped. Missing
     * title/author/date/content nodes yield empty fields instead of errors.
     *
     * NOTE(review): the file name has only the resolution of the displayed
     * date (or of time() as fallback), so two posts sharing it overwrite
     * each other - confirm whether the post id should be part of the name.
     *
     * @param simple_html_dom_node $message Post container node.
     * @param string               $theme   Thread title used as file prefix.
     *
     * @return $this
     */
    protected function saveMessage(simple_html_dom_node $message, string $theme)
    {
        $id = $message->attr['id'] ?? null;

        // preg_match() yields 0 (not false) on "no match", so the prefix is
        // compared directly instead.
        if ($id === null || strpos((string) $id, 'post') !== 0) {
            return $this;
        }

        $dateClass = getenv('MESSAGE_DATE_CLASS');
        $postDate = $dateClass ? $message->find('.' . $dateClass, 0) : null;
        $dateNode = $postDate ? $postDate->find('.date', 0) : null;

        $date = $dateNode
            ? preg_replace('/[ ]+/', ' ', trim($dateNode->text()))
            : null;

        $clearDate = $date
            ? preg_replace('/[ \.:]+/', '-', $date)
            : null;

        // The title comes from a remote page: strip anything that could
        // escape the posts folder or be rejected by the file system.
        $fileName = $this->sanitizeFileName($theme . '-' . ($clearDate ?: time())) . '.txt';

        $titleDOM = $message->find('h2.title', 0);

        $userInfoDOM = $message->find('.userinfo', 0);
        $userNameDOM = $userInfoDOM ? $userInfoDOM->find('.username', 0) : null;

        $postBodyDOM = $message->find('.postbody', 0);
        $postContentDOM = $postBodyDOM ? $postBodyDOM->find('blockquote.postcontent', 0) : null;

        $messageTitle = $titleDOM ? $titleDOM->text() : '';
        $messageAuthor = $userNameDOM ? $userNameDOM->text() : '';
        $messageText = $postContentDOM ? $postContentDOM->text() : '';

        $fileContent = trim($messageTitle) . PHP_EOL . $messageAuthor . PHP_EOL . $date . PHP_EOL . trim($messageText);

        $folder = $this->postsFolder();

        // The second is_dir() covers a concurrent process creating the folder.
        if (!is_dir($folder) && !mkdir($folder, 0775, true) && !is_dir($folder)) {
            $this->logger->addError('Unable to create posts folder: ' . $folder);

            return $this;
        }

        $path = $folder . DIRECTORY_SEPARATOR . $fileName;

        if (file_put_contents($path, $fileContent) === false) {
            $this->logger->addError('Unable to write post file: ' . $path);
        }

        return $this;
    }

    /**
     * Posts the credentials to the login URL and collects the session cookies.
     *
     * Authentication is best effort: every failure is logged and results in
     * an empty cookie string so the thread is still fetched anonymously.
     *
     * @return string Cookie header value ("a=1; b=2"), or '' without a session.
     */
    private function authenticate(): string
    {
        $authAction = $this->authAction();

        if (!$authAction) {
            $this->logger->addWarning('Auth action is empty');

            return '';
        }

        // The login form expects the MD5 of the password, not the plain text.
        $passwordHash = md5((string) getenv('AUTH_PASSWORD'));

        $response = $this->request($authAction, [
            CURLOPT_COOKIESESSION => false,
            CURLOPT_POST => true,
            // Headers are needed in the output to read Set-Cookie.
            CURLOPT_HEADER => true,
            CURLOPT_POSTFIELDS => [
                'vb_login_username' => (string) getenv('AUTH_USERNAME'),
                'vb_login_md5password' => $passwordHash,
                'vb_login_md5password_utf' => $passwordHash,
                'securitytoken' => 'guest',
                'cookieuser' => 1,
                'do' => 'login',
                'vb_login_password' => '',
            ],
        ]);

        return $response === false ? '' : $this->extractCookies($response);
    }

    /**
     * Performs one HTTP request on its own cURL handle, which is always closed.
     *
     * @param string $url     Request URL.
     * @param array  $options Extra CURLOPT_* options; they override the defaults.
     *
     * @return string|false Response (with headers when CURLOPT_HEADER is set),
     *                      or false when the transfer failed (already logged).
     */
    private function request(string $url, array $options = [])
    {
        $handle = curl_init($url);

        if ($handle === false) {
            $this->logger->addError('Unable to initialise cURL for ' . $url);

            return false;
        }

        try {
            curl_setopt_array($handle, $options + [
                CURLOPT_FRESH_CONNECT => true,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_USERAGENT => $this->userAgent,
            ]);

            $response = curl_exec($handle);

            if ($response === false) {
                $this->logger->addError(sprintf('Request to %s failed: %s', $url, curl_error($handle)));
            }

            return $response;
        } finally {
            curl_close($handle);
        }
    }

    /**
     * Builds a Cookie header value from the Set-Cookie lines of a response.
     *
     * Header names are matched case-insensitively, and a value ends at the
     * first ';' or at the end of the line.
     *
     * @param string $response Raw response including its header block.
     *
     * @return string "name=value" pairs joined by "; ", or '' when none exist.
     */
    private function extractCookies(string $response): string
    {
        if (!preg_match_all('/^Set-Cookie:[ \t]*([^;\r\n]*)/mi', $response, $matches)) {
            return '';
        }

        $pairs = array_filter(array_map('trim', $matches[1]), 'strlen');

        return implode('; ', $pairs);
    }

    /**
     * Turns a form action into an absolute URL.
     *
     * @param string $action Non-empty value of the form's action attribute.
     *
     * @return string The action itself when already absolute (http/https),
     *                otherwise the forum URL and the action joined by one '/'.
     */
    private function resolveUrl(string $action): string
    {
        if (preg_match('#^https?://#i', $action) === 1) {
            return $action;
        }

        return rtrim((string) $this->forumURL, '/') . '/' . ltrim($action, '/');
    }

    /**
     * Makes a string safe to use as a single file name.
     *
     * Runs of path separators, characters reserved on common file systems
     * and control characters become '_'. Only ASCII bytes are touched, so
     * multi-byte text is preserved.
     *
     * @param string $name Raw name (without extension).
     *
     * @return string Sanitised name; 'post' when nothing usable remains.
     */
    private function sanitizeFileName(string $name): string
    {
        $clean = (string) preg_replace('#[\\\\/:*?"<>|\x00-\x1F]+#', '_', $name);

        // Leading dots would create hidden files or ".." segments.
        $clean = trim($clean, ' .');

        return $clean !== '' ? $clean : 'post';
    }
}
0 commit comments