Skip to content

Commit 70a4f45

Browse files
committed
Codereview and cleanup.
Used sublime with PHPCodeSniffer and Drupal Std.
1 parent 7785610 commit 70a4f45

File tree

1 file changed

+163
-133
lines changed

1 file changed

+163
-133
lines changed

os2web_cp_service.module

Lines changed: 163 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
*/
77

88
include_once 'os2web_cp_service.features.inc';
9-
define('DEFAULT_CASE_FILE_LIMIT', 50);//number of files to handle at once
9+
define('DEFAULT_CASE_FILE_LIMIT', 50); // Number of files to handle at once.
1010

1111
/**
1212
* Implements hook_init().
@@ -40,7 +40,7 @@ function os2web_cp_service_menu() {
4040
'page arguments' => array(4),
4141
'access callback' => TRUE,
4242
);
43-
//to delete
43+
// To manually run the cron job.
4444
$items['os2web/cp_cron'] = array(
4545
'type' => MENU_CALLBACK,
4646
'page callback' => 'os2web_cp_service_cron',
@@ -554,7 +554,7 @@ function os2web_cp_service_create_document(array $data) {
554554
}
555555
if ($is_missing) {
556556
$cnode->field_os2web_cp_service_doc_ref[LANGUAGE_NONE][]['target_id'] = $node->nid;
557-
//adding for pdf 2 html conversion, [email protected]
557+
// Adding file for pdf 2 html conversion, [email protected].
558558
os2web_cp_service_schedule_document_pdf2html_conversion($data['fields']['Indhold - FileID'], $cnode->nid);
559559
node_save($cnode);
560560
}
@@ -929,12 +929,12 @@ function os2web_cp_service_get_extension_from_mime($mime) {
929929
/**
930930
* Schedules a document for pdf to html conversion by adding the document to the database table.
931931
*
932-
* @param string $file_id id of the file on remove server
933-
* @param int $case_nid nid of the destination case, which metadata field should be updated
934-
*
935-
* @return none
932+
* @param string $file_id
933+
* Id of the file on the remote server.
934+
* @param int $case_nid
935+
* Nid of the destination case whose metadata field should be updated.
936936
*/
937-
function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid){
937+
function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid) {
938938
db_insert('os2web_cp_service_documents_conversion')
939939
->fields(array(
940940
'file_id' => $file_id,
@@ -948,190 +948,220 @@ function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case
948948

949949
/**
950950
* Cron implementation.
951+
*
951952
* Goes through the entries in the database table,
952953
* downloads the documents as pdf,
953-
* and converts the documents from pdf to html and updates the case metadata field with the document contents.
954-
*
955-
* @return none
954+
* and converts the documents from pdf to html and updates the
955+
* case metadata field with the document contents.
956956
*/
957-
function os2web_cp_service_cron(){
958-
//download
959-
$query = db_select('os2web_cp_service_documents_conversion', 'dc');
960-
$query->fields('dc',array('file_id'))
957+
function os2web_cp_service_cron() {
958+
// Download.
959+
$query = db_select('os2web_cp_service_documents_conversion', 'dc');
960+
$query->fields('dc', array('file_id'))
961961
->isNull('dc.status')
962-
->range(0,DEFAULT_CASE_FILE_LIMIT);
963-
$result = $query->execute();
964-
while($record = $result->fetchAssoc()) {
965-
_os2web_cp_service_document_download($record['file_id']);
966-
}
962+
->range(0, DEFAULT_CASE_FILE_LIMIT);
963+
$result = $query->execute();
964+
while ($record = $result->fetchAssoc()) {
965+
os2web_cp_service_document_download($record['file_id']);
966+
}
967967

968-
//convert
969-
$query = db_select('os2web_cp_service_documents_conversion', 'dc');
970-
$query->fields('dc',array('file_id', 'filepath_pdf'))
968+
// Convert.
969+
$query = db_select('os2web_cp_service_documents_conversion', 'dc');
970+
$query->fields('dc', array('file_id', 'filepath_pdf'))
971971
->condition('dc.status', 'downloaded')
972-
->range(0,DEFAULT_CASE_FILE_LIMIT);
973-
$result = $query->execute();
974-
while($record = $result->fetchAssoc()) {
975-
_os2web_cp_service_document_convert($record['file_id'], $record['filepath_pdf']);
976-
}
972+
->range(0, DEFAULT_CASE_FILE_LIMIT);
973+
$result = $query->execute();
974+
while ($record = $result->fetchAssoc()) {
975+
_os2web_cp_service_document_convert($record['file_id'], $record['filepath_pdf']);
976+
}
977977

978-
//field updating
979-
$query = db_select('os2web_cp_service_documents_conversion', 'dc');
980-
$query->fields('dc',array('file_id', 'case_nid', 'filepath_pdf', 'filepath_html'))
978+
// Field updating.
979+
$query = db_select('os2web_cp_service_documents_conversion', 'dc');
980+
$query->fields('dc', array(
981+
'file_id',
982+
'case_nid',
983+
'filepath_pdf',
984+
'filepath_html')
985+
)
981986
->condition('dc.status', 'converted')
982-
->range(0,DEFAULT_CASE_FILE_LIMIT);
983-
$result = $query->execute();
984-
while($record = $result->fetchAssoc()) {
985-
_os2web_cp_service_update_case_metadata($record['file_id'], $record['case_nid'], $record['filepath_pdf'], $record['filepath_html']);
986-
}
987+
->range(0, DEFAULT_CASE_FILE_LIMIT);
988+
$result = $query->execute();
989+
while ($record = $result->fetchAssoc()) {
990+
_os2web_cp_service_update_case_metadata($record['file_id'], $record['case_nid'], $record['filepath_pdf'], $record['filepath_html']);
991+
}
987992
}
988993

989994
/**
990-
* Download the document file and places it into the temporary location
991-
* also updates the database entry with the created filepath
995+
* Download the document file and places it into the temporary location.
992996
*
993-
* If file is not found on the remote server, the status is changed to "ERROR: 404 not found"
994-
* If ULR returns anything else than 200 http code, the status is chaned to "ERROR: {httpCode}"
997+
* Also updates the database entry with the created filepath.
998+
* If file is not found on the remote server,
999+
* the status is changed to "ERROR: 404 not found"
9951000
*
996-
* @param $file_id string id of the document file
1001+
* If the URL returns anything other than a 200 HTTP code,
1002+
* the status is changed to "ERROR: {http_code}"
9971003
*
998-
* @return none
1004+
* @param string $file_id
1005+
* id of the document file
9991006
*/
1000-
function _os2web_cp_service_document_download($file_id){
1001-
$url = $GLOBALS['base_url'] . '/os2web/service/gf/v1/' . $file_id;//address of remote file
1002-
$tmpfname = tempnam(file_directory_temp(), "os2web_cp_document_");//path to where the file will be downloaded
1007+
function _os2web_cp_service_document_download($file_id) {
1008+
// Address of remote file.
1009+
$url = $GLOBALS['base_url'] . '/os2web/service/gf/v1/' . $file_id;
1010+
// Path to where the file will be downloaded.
1011+
$tmpfname = tempnam(file_directory_temp(), "os2web_cp_document_");
10031012

10041013
$fp = fopen($tmpfname, 'w');
10051014

10061015
$ch = curl_init($url);
10071016
curl_setopt($ch, CURLOPT_FILE, $fp);
10081017
$data= curl_exec($ch);
10091018

1010-
//Check for 404 (file not found)
1011-
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
1019+
// Check for 404 (file not found).
1020+
$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
10121021
curl_close($ch);
10131022
fclose($fp);
10141023

1015-
if ($httpCode == 404) {
1024+
if ($http_code == 404) {
10161025
print_r('404');
10171026
db_update('os2web_cp_service_documents_conversion')
10181027
->fields(array(
1019-
'status' => 'ERROR: 404 not found',
1028+
'status' => 'ERROR: 404 not found',
10201029
))
10211030
->condition('file_id', $file_id)
10221031
->execute();
1023-
} else if ($httpCode == 200) {
1032+
}
1033+
elseif ($http_code == 200) {
10241034
db_update('os2web_cp_service_documents_conversion')
1025-
->fields(array(
1026-
'filepath_pdf' => $tmpfname,
1027-
'status' => 'downloaded',
1028-
))
1035+
->fields(array(
1036+
'filepath_pdf' => $tmpfname,
1037+
'status' => 'downloaded',
1038+
))
10291039
->condition('file_id', $file_id)
10301040
->execute();
1031-
} else {//unknown error
1041+
}
1042+
// Unknown error.
1043+
else {
10321044
db_update('os2web_cp_service_documents_conversion')
1033-
->fields(array(
1034-
'status' => 'ERROR: ' . $httpCode,
1035-
))
1045+
->fields(array(
1046+
'status' => 'ERROR: ' . $http_code,
1047+
))
10361048
->condition('file_id', $file_id)
10371049
->execute();
10381050
}
10391051
}
10401052

10411053
/**
1042-
* Converts the downloaded pdf into HTML
1043-
* also updates the database entry with the created filepath
1054+
* Converts the downloaded pdf into HTML.
10441055
*
1045-
* If PDF file is not found, the status is changes to NULL, so that the next cron job will download the file again.
1056+
* Also updates the database entry with the created filepath.
1057+
* If PDF file is not found, the status is changed to NULL,
1058+
* so that the next cron job will download the file again.
10461059
*
1047-
* @param $file_id string id of the document file
1048-
* @param $path_to_pdf string path to the pdf version of the file
1060+
* @param string $file_id
1061+
* id of the document file
10491062
*
1050-
* @return none
1063+
* @param string $path_to_pdf
1064+
* path to the pdf version of the file
10511065
*/
1052-
function _os2web_cp_service_document_convert($file_id, $path_to_pdf){
1053-
if (!file_exists($path_to_pdf)){//if does not exist, send for redownloading
1054-
db_update('os2web_cp_service_documents_conversion')
1055-
->fields(array(
1056-
'filepath_pdf' => null,
1057-
'status' => null,
1058-
))
1059-
->condition('file_id', $file_id)
1060-
->execute();
1061-
} else {
1062-
shell_exec('pdf2htmlEX ' . $path_to_pdf . ' --dest-dir ' . file_directory_temp());
1063-
db_update('os2web_cp_service_documents_conversion')
1064-
->fields(array(
1065-
'filepath_html' => $path_to_pdf . '.html',
1066-
'status' => 'converted',
1067-
))
1066+
function _os2web_cp_service_document_convert($file_id, $path_to_pdf) {
1067+
// If does not exist, send for redownloading.
1068+
if (!file_exists($path_to_pdf)) {
1069+
db_update('os2web_cp_service_documents_conversion')
1070+
->fields(array(
1071+
'filepath_pdf' => NULL,
1072+
'status' => NULL,
1073+
))
10681074
->condition('file_id', $file_id)
10691075
->execute();
1070-
}
1076+
}
1077+
else {
1078+
shell_exec('pdf2htmlEX ' . $path_to_pdf . ' --dest-dir ' . file_directory_temp());
1079+
db_update('os2web_cp_service_documents_conversion')
1080+
->fields(array(
1081+
'filepath_html' => $path_to_pdf . '.html',
1082+
'status' => 'converted',
1083+
))
1084+
->condition('file_id', $file_id)
1085+
->execute();
1086+
}
10711087
}
10721088

10731089
/**
1074-
* Takes the convent of the html, removed everything except the pure text and then appends this text to case node search metadata field.
1075-
* In the end the temp files (pdf and html) are deleted.
1090+
* Update field in case node.
10761091
*
1077-
* If HTML file is not found, the status will be changed to "downloaded" so that the text cron job will convert the file again.
1078-
* If case node is not found, the status will be chaned to "ERROR: node not found".
1092+
* Takes the content of the html, removes everything except the pure text
1093+
* and then appends this text to case node search metadata field.
1094+
* In the end the temp files (pdf and html) are deleted.
10791095
*
1080-
* @param $file_id string id of the document file
1081-
* @param $case_nid the node id the case node, which metadata should be updated
1082-
* @param $path_to_pdf string path to the pdf version of the file
1083-
* @param $path_to_html string path to the html version of the file
1096+
* If HTML file is not found, the status will be changed to "downloaded"
1097+
* so that the next cron job will convert the file again.
1098+
* If case node is not found, the status will be changed to
1099+
* "ERROR: node not found".
10841100
*
1085-
* @return none
1101+
* @param string $file_id
1102+
* Id of the document file
1103+
* @param int $case_nid
1104+
* The node id of the case node whose metadata should be updated
1105+
* @param string $path_to_pdf
1106+
* Path to the pdf version of the file
1107+
* @param string $path_to_html
1108+
* Path to the html version of the file
10861109
*/
1087-
function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html){
1088-
if (!file_exists($path_to_html)){//if does not exist, send for reconverting. PDF existence will be checked on that step as well.
1110+
function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html) {
1111+
// If does not exist, send for reconverting.
1112+
// PDF existence will be checked on that step as well.
1113+
if (!file_exists($path_to_html)) {
1114+
db_update('os2web_cp_service_documents_conversion')
1115+
->fields(array(
1116+
'filepath_html' => NULL,
1117+
'status' => 'downloaded',
1118+
))
1119+
->condition('file_id', $file_id)
1120+
->execute();
1121+
}
1122+
else {
1123+
$case_node = node_load($case_nid);
1124+
if (!$case_node) {
10891125
db_update('os2web_cp_service_documents_conversion')
10901126
->fields(array(
1091-
'filepath_html' => null,
1092-
'status' => 'downloaded',
1127+
'status' => 'ERROR: node not found',
10931128
))
10941129
->condition('file_id', $file_id)
10951130
->execute();
1096-
} else {
1097-
$case_node = node_load($case_nid);
1098-
if (!$case_node){
1099-
db_update('os2web_cp_service_documents_conversion')
1100-
->fields(array(
1101-
'status' => 'ERROR: node not found',
1102-
))
1103-
->condition('file_id', $file_id)
1104-
->execute();
1105-
} else {
1106-
$text = file_get_contents($path_to_html);
1107-
1108-
//html tags removing
1109-
$text = str_replace('<p>&nbsp;</p>', ' ', $text); //removing unneeded paragraphs
1110-
$text = preg_replace('#<style(.*?)>(.*?)</style>#is', ' ', $text);//removing style tags
1111-
$text = preg_replace('#<script(.*?)>(.*?)</script>#is', ' ', $text);//removing scripts tags
1112-
$text = str_replace("\r\n", " ", strip_tags($text));
1113-
$text = str_replace("\n\r", " ", $text);
1114-
$text = str_replace("\n", " ", $text);
1115-
$text = str_replace("\r", " ", $text);
1116-
if (isset($case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'])) {
1117-
$search_metadata = $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] . $text;
1118-
}
1119-
else{
1120-
$search_metadata = $text;
1121-
}
1122-
$case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] = $search_metadata;
1123-
node_save($case_node);
1124-
1125-
db_update('os2web_cp_service_documents_conversion')
1126-
->fields(array(
1127-
'status' => 'done',
1128-
))
1129-
->condition('file_id', $file_id)
1130-
->execute();
1131-
1132-
//tmp files cleanup
1133-
unlink($path_to_html);
1134-
unlink($path_to_pdf);
1131+
}
1132+
else {
1133+
$text = file_get_contents($path_to_html);
1134+
1135+
// Html tags removing.
1136+
// Removing unneeded paragraphs.
1137+
$text = str_replace('<p>&nbsp;</p>', ' ', $text);
1138+
// Removing style tags.
1139+
$text = preg_replace('#<style(.*?)>(.*?)</style>#is', ' ', $text);
1140+
// Removing scripts tags.
1141+
$text = preg_replace('#<script(.*?)>(.*?)</script>#is', ' ', $text);
1142+
$text = str_replace("\r\n", " ", strip_tags($text));
1143+
$text = str_replace("\n\r", " ", $text);
1144+
$text = str_replace("\n", " ", $text);
1145+
$text = str_replace("\r", " ", $text);
1146+
if (isset($case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'])) {
1147+
$search_metadata = $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] . $text;
1148+
}
1149+
else {
1150+
$search_metadata = $text;
11351151
}
1152+
$case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] = $search_metadata;
1153+
node_save($case_node);
1154+
1155+
db_update('os2web_cp_service_documents_conversion')
1156+
->fields(array(
1157+
'status' => 'done',
1158+
))
1159+
->condition('file_id', $file_id)
1160+
->execute();
1161+
1162+
// Tmp files cleanup.
1163+
unlink($path_to_html);
1164+
unlink($path_to_pdf);
11361165
}
1166+
}
11371167
}

0 commit comments

Comments
 (0)