Skip to content

Commit 4d2f5b7

Browse files
committed
base parser structure
1 parent 24562f4 commit 4d2f5b7

File tree

5 files changed

+1847
-2
lines changed

5 files changed

+1847
-2
lines changed

.gitignore

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Created by .ignore support plugin (hsz.mobi)
2+
### JetBrains template
3+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
4+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
5+
6+
# User-specific stuff:
7+
.idea/
8+
9+
### macOS template
10+
# General
11+
.DS_Store
12+
.AppleDouble
13+
.LSOverride
14+
15+
# Icon must end with two \r
16+
Icon
17+
18+
# Thumbnails
19+
._*
20+
21+
# Files that might appear in the root of a volume
22+
.DocumentRevisions-V100
23+
.fseventsd
24+
.Spotlight-V100
25+
.TemporaryItems
26+
.Trashes
27+
.VolumeIcon.icns
28+
.com.apple.timemachine.donotpresent
29+
30+
# Directories potentially created on remote AFP share
31+
.AppleDB
32+
.AppleDesktop
33+
Network Trash Folder
34+
Temporary Items
35+
.apdisk
36+
### Linux template
37+
*~
38+
39+
# temporary files which can be created if a process still has a handle open of a deleted file
40+
.fuse_hidden*
41+
42+
# KDE directory preferences
43+
.directory
44+
45+
# Linux trash folder which might appear on any partition or disk
46+
.Trash-*
47+
48+
# .nfs files are created when an open file is removed but is still being accessed
49+
.nfs*
50+

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
1-
# techtask
2-
PHP technical task
1+
This branch contains only the parser.

index.php

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?php
/**
 * Forum thread parser.
 *
 * Downloads a single forum thread page, extracts every post from it and stores
 * each post as a plain-text file inside the "posts" directory next to this script.
 *
 * Output file name: "<page title>-<post date>.txt".
 * Output file body: post title, author and post text, one per line.
 *
 * On an unrecoverable failure (download or parse error, missing output directory)
 * the script logs the reason via error_log() and exits with status 1.
 */
include_once 'simple_html_dom.php';

/**
 * Follows a chain of selectors, taking the first match at each step, and
 * returns the text of the final node.
 *
 * @param object   $node      Node to start from; must provide find($selector, $index).
 * @param string[] $selectors Selectors applied one after another.
 *
 * @return string|null Text of the last matched node, or null when any step matches nothing.
 */
function parser_find_text($node, array $selectors)
{
    foreach ($selectors as $selector) {
        $node = $node->find($selector, 0);

        if (!$node) {
            return null;
        }
    }

    return $node->text();
}

$curl = curl_init('http://forumodua.com/showthread.php?t=851487');

if ($curl === false) {
    error_log('Unable to initialise cURL');
    exit(1);
}

curl_setopt($curl, CURLOPT_FRESH_CONNECT, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36');
// NOTE(review): Content-Type describes a request body, which this GET request
// does not have; the header is kept only so the request stays unchanged.
curl_setopt($curl, CURLOPT_HTTPHEADER, [
    "Content-Type: text/xml; charset=utf-8",
    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,uk;q=0.6"
]);

$result = curl_exec($curl);
// The error text has to be read before the handle is closed.
$curlError = curl_error($curl);

curl_close($curl);

// curl_exec() returns false on failure; an empty body cannot be parsed either.
if ($result === false || $result === '') {
    error_log('Unable to download the thread page: '.$curlError);
    exit(1);
}

$html = str_get_html($result);

// str_get_html() returns false when it refuses the input, so guard before using it.
if (!$html) {
    error_log('Unable to parse the downloaded page');
    exit(1);
}

$titleNode = $html->find('title', 0);
$theme = $titleNode ? $titleNode->text() : '';
// The title becomes part of a file name: replace path separators, characters
// that are illegal in file names on common file systems, and control characters.
$theme = preg_replace('#[\\\\/:*?"<>|\x00-\x1F]+#', '-', $theme);

$folder = __DIR__.DIRECTORY_SEPARATOR.'posts';

if (!is_dir($folder) && !mkdir($folder, 0777, true) && !is_dir($folder)) {
    error_log('Unable to create output directory: '.$folder);
    exit(1);
}

/** @var simple_html_dom_node $message */
foreach ($html->find('.postbitlegacy') as $message) {
    $messageId = isset($message->attr['id']) ? (string) $message->attr['id'] : '';

    // Only elements whose id starts with "post" are processed.
    // (preg_match() returns 0, not false, when nothing matches, so the former
    // "=== false" comparison never skipped anything.)
    if (strncmp($messageId, 'post', 4) !== 0) {
        continue;
    }

    $date = parser_find_text($message, ['.postdate', '.date']);

    if ($date === null) {
        // Without a date there is nothing to build the file name from.
        error_log('Skipping '.$messageId.': post date not found');
        continue;
    }

    // Collapse runs of spaces, dots, colons and "&nbsp;" entities into one dash.
    // The entity is matched as a whole; inside a character class its letters
    // would be treated as individual characters to replace.
    $clearDate = preg_replace('/(?:&nbsp;|[ .:])+/', '-', trim($date));

    // NOTE(review): two posts that produce the same date string end up with the
    // same file name, and the later one overwrites the earlier one - confirm
    // whether the post id should be added to the name.
    $fileName = $theme.'-'.$clearDate.'.txt';

    $messageTitle = parser_find_text($message, ['h2.title']);
    $messageAuthor = parser_find_text($message, ['.userinfo', '.username']);
    $messageText = parser_find_text($message, ['.postbody', 'blockquote.postcontent']);

    // Missing parts are written as empty lines instead of aborting the whole run.
    $fileContent = (string) $messageTitle.PHP_EOL.(string) $messageAuthor.PHP_EOL.(string) $messageText;

    if (file_put_contents($folder.DIRECTORY_SEPARATOR.$fileName, $fileContent) === false) {
        error_log('Unable to write '.$folder.DIRECTORY_SEPARATOR.$fileName);
    }
}

posts/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore

0 commit comments

Comments
 (0)