6
6
*/
7
7
8
8
include_once 'os2web_cp_service.features.inc';
9
- define('DEFAULT_CASE_FILE_LIMIT', 50);//number of files to handle at once
9
+ define('DEFAULT_CASE_FILE_LIMIT', 50); // Number of files to handle at once.
10
10
11
11
/**
12
12
* Implements hook_init().
@@ -40,7 +40,7 @@ function os2web_cp_service_menu() {
40
40
'page arguments' => array(4),
41
41
'access callback' => TRUE,
42
42
);
43
- //to delete
43
+ // To manually run the cron job.
44
44
$items['os2web/cp_cron'] = array(
45
45
'type' => MENU_CALLBACK,
46
46
'page callback' => 'os2web_cp_service_cron',
@@ -554,7 +554,7 @@ function os2web_cp_service_create_document(array $data) {
554
554
}
555
555
if ($is_missing) {
556
556
$cnode->field_os2web_cp_service_doc_ref[LANGUAGE_NONE][]['target_id'] = $node->nid;
557
- //
adding for pdf 2 html conversion,
[email protected]
557
+ // Adding file for pdf 2 html conversion, [email protected].
558
558
os2web_cp_service_schedule_document_pdf2html_conversion($data['fields']['Indhold - FileID'], $cnode->nid);
559
559
node_save($cnode);
560
560
}
@@ -929,12 +929,12 @@ function os2web_cp_service_get_extension_from_mime($mime) {
929
929
/**
930
930
* Schedules a document for pdf to html conversion by adding the document to the database table.
931
931
*
932
- * @param string $file_id id of the file on remove server
933
- * @param int $case_nid nid of the destination case, which metadata field should be updated
934
- *
935
- * @return none
932
+ * @param string $file_id
933
+ * Id of the file on the remote server.
934
+ * @param int $case_nid
935
+ * Nid of the destination case whose metadata field should be updated.
936
936
*/
937
- function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid){
937
+ function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case_nid) {
938
938
db_insert('os2web_cp_service_documents_conversion')
939
939
->fields(array(
940
940
'file_id' => $file_id,
@@ -948,190 +948,220 @@ function os2web_cp_service_schedule_document_pdf2html_conversion($file_id, $case
948
948
949
949
/**
950
950
* Cron implementation.
951
+ *
951
952
* Goes through the entries in the database table,
952
953
* downloads the documents as pdf,
953
- * and converts the documents from pdf to html and updates the case metadata field with the document contents.
954
- *
955
- * @return none
954
+ * and converts the documents from pdf to html and updates the
955
+ * case metadata field with the document contents.
956
956
*/
957
- function os2web_cp_service_cron(){
958
- //download
959
- $query = db_select('os2web_cp_service_documents_conversion', 'dc');
960
- $query->fields('dc',array('file_id'))
957
+ function os2web_cp_service_cron() {
958
+ // Download.
959
+ $query = db_select('os2web_cp_service_documents_conversion', 'dc');
960
+ $query->fields('dc', array('file_id'))
961
961
->isNull('dc.status')
962
- ->range(0,DEFAULT_CASE_FILE_LIMIT);
963
- $result = $query->execute();
964
- while($record = $result->fetchAssoc()) {
965
- _os2web_cp_service_document_download ($record['file_id']);
966
- }
962
+ ->range(0, DEFAULT_CASE_FILE_LIMIT);
963
+ $result = $query->execute();
964
+ while ($record = $result->fetchAssoc()) {
965
+ os2web_cp_service_document_download ($record['file_id']);
966
+ }
967
967
968
- //convert
969
- $query = db_select('os2web_cp_service_documents_conversion', 'dc');
970
- $query->fields('dc',array('file_id', 'filepath_pdf'))
968
+ // Convert.
969
+ $query = db_select('os2web_cp_service_documents_conversion', 'dc');
970
+ $query->fields('dc', array('file_id', 'filepath_pdf'))
971
971
->condition('dc.status', 'downloaded')
972
- ->range(0,DEFAULT_CASE_FILE_LIMIT);
973
- $result = $query->execute();
974
- while($record = $result->fetchAssoc()) {
975
- _os2web_cp_service_document_convert($record['file_id'], $record['filepath_pdf']);
976
- }
972
+ ->range(0, DEFAULT_CASE_FILE_LIMIT);
973
+ $result = $query->execute();
974
+ while ($record = $result->fetchAssoc()) {
975
+ _os2web_cp_service_document_convert($record['file_id'], $record['filepath_pdf']);
976
+ }
977
977
978
- //field updating
979
- $query = db_select('os2web_cp_service_documents_conversion', 'dc');
980
- $query->fields('dc',array('file_id', 'case_nid', 'filepath_pdf', 'filepath_html'))
978
+ // Field updating.
979
+ $query = db_select('os2web_cp_service_documents_conversion', 'dc');
980
+ $query->fields('dc', array(
981
+ 'file_id',
982
+ 'case_nid',
983
+ 'filepath_pdf',
984
+ 'filepath_html')
985
+ )
981
986
->condition('dc.status', 'converted')
982
- ->range(0,DEFAULT_CASE_FILE_LIMIT);
983
- $result = $query->execute();
984
- while($record = $result->fetchAssoc()) {
985
- _os2web_cp_service_update_case_metadata($record['file_id'], $record['case_nid'], $record['filepath_pdf'], $record['filepath_html']);
986
- }
987
+ ->range(0, DEFAULT_CASE_FILE_LIMIT);
988
+ $result = $query->execute();
989
+ while ($record = $result->fetchAssoc()) {
990
+ _os2web_cp_service_update_case_metadata($record['file_id'], $record['case_nid'], $record['filepath_pdf'], $record['filepath_html']);
991
+ }
987
992
}
988
993
989
994
/**
990
- * Download the document file and places it into the temporary location
991
- * also updates the database entry with the created filepath
995
+ * Download the document file and places it into the temporary location.
992
996
*
993
- * If file is not found on the remote server, the status is changed to "ERROR: 404 not found"
994
- * If ULR returns anything else than 200 http code, the status is chaned to "ERROR: {httpCode}"
997
+ * Also updates the database entry with the created filepath.
998
+ * If file is not found on the remote server,
999
+ * the status is changed to "ERROR: 404 not found"
995
1000
*
996
- * @param $file_id string id of the document file
1001
+ * If the URL returns anything other than a 200 http code,
1002
+ * the status is changed to "ERROR: {http_code}"
997
1003
*
998
- * @return none
1004
+ * @param string $file_id
1005
+ * id of the document file
999
1006
*/
1000
- function _os2web_cp_service_document_download($file_id){
1001
- $url = $GLOBALS['base_url'] . '/os2web/service/gf/v1/' . $file_id;//address of remote file
1002
- $tmpfname = tempnam(file_directory_temp(), "os2web_cp_document_");//path to where the file will be downloaded
1007
+ function _os2web_cp_service_document_download($file_id) {
1008
+ // Address of remote file.
1009
+ $url = $GLOBALS['base_url'] . '/os2web/service/gf/v1/' . $file_id;
1010
+ // Path to where the file will be downloaded.
1011
+ $tmpfname = tempnam(file_directory_temp(), "os2web_cp_document_");
1003
1012
1004
1013
$fp = fopen($tmpfname, 'w');
1005
1014
1006
1015
$ch = curl_init($url);
1007
1016
curl_setopt($ch, CURLOPT_FILE, $fp);
1008
1017
$data= curl_exec($ch);
1009
1018
1010
- //Check for 404 (file not found)
1011
- $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
1019
+ // Check for 404 (file not found).
1020
+ $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
1012
1021
curl_close($ch);
1013
1022
fclose($fp);
1014
1023
1015
- if ($httpCode == 404) {
1024
+ if ($http_code == 404) {
1016
1025
print_r('404');
1017
1026
db_update('os2web_cp_service_documents_conversion')
1018
1027
->fields(array(
1019
- 'status' => 'ERROR: 404 not found',
1028
+ 'status' => 'ERROR: 404 not found',
1020
1029
))
1021
1030
->condition('file_id', $file_id)
1022
1031
->execute();
1023
- } else if ($httpCode == 200) {
1032
+ }
1033
+ elseif ($http_code == 200) {
1024
1034
db_update('os2web_cp_service_documents_conversion')
1025
- ->fields(array(
1026
- 'filepath_pdf' => $tmpfname,
1027
- 'status' => 'downloaded',
1028
- ))
1035
+ ->fields(array(
1036
+ 'filepath_pdf' => $tmpfname,
1037
+ 'status' => 'downloaded',
1038
+ ))
1029
1039
->condition('file_id', $file_id)
1030
1040
->execute();
1031
- } else {//unknown error
1041
+ }
1042
+ // Unknown error.
1043
+ else {
1032
1044
db_update('os2web_cp_service_documents_conversion')
1033
- ->fields(array(
1034
- 'status' => 'ERROR: ' . $httpCode ,
1035
- ))
1045
+ ->fields(array(
1046
+ 'status' => 'ERROR: ' . $http_code ,
1047
+ ))
1036
1048
->condition('file_id', $file_id)
1037
1049
->execute();
1038
1050
}
1039
1051
}
1040
1052
1041
1053
/**
1042
- * Converts the downloaded pdf into HTML
1043
- * also updates the database entry with the created filepath
1054
+ * Converts the downloaded pdf into HTML.
1044
1055
*
1045
- * If PDF file is not found, the status is changes to NULL, so that the next cron job will download the file again.
1056
+ * Also updates the database entry with the created filepath.
1057
+ * If PDF file is not found, the status is changed to NULL,
1058
+ * so that the next cron job will download the file again.
1046
1059
*
1047
- * @param $file_id string id of the document file
1048
- * @param $path_to_pdf string path to the pdf version of the file
1060
+ * @param string $file_id
1061
+ * id of the document file
1049
1062
*
1050
- * @return none
1063
+ * @param string $path_to_pdf
1064
+ * path to the pdf version of the file
1051
1065
*/
1052
- function _os2web_cp_service_document_convert($file_id, $path_to_pdf){
1053
- if (!file_exists($path_to_pdf)){//if does not exist, send for redownloading
1054
- db_update('os2web_cp_service_documents_conversion')
1055
- ->fields(array(
1056
- 'filepath_pdf' => null,
1057
- 'status' => null,
1058
- ))
1059
- ->condition('file_id', $file_id)
1060
- ->execute();
1061
- } else {
1062
- shell_exec('pdf2htmlEX ' . $path_to_pdf . ' --dest-dir ' . file_directory_temp());
1063
- db_update('os2web_cp_service_documents_conversion')
1064
- ->fields(array(
1065
- 'filepath_html' => $path_to_pdf . '.html',
1066
- 'status' => 'converted',
1067
- ))
1066
+ function _os2web_cp_service_document_convert($file_id, $path_to_pdf) {
1067
+ // If does not exist, send for redownloading.
1068
+ if (!file_exists($path_to_pdf)) {
1069
+ db_update('os2web_cp_service_documents_conversion')
1070
+ ->fields(array(
1071
+ 'filepath_pdf' => NULL,
1072
+ 'status' => NULL,
1073
+ ))
1068
1074
->condition('file_id', $file_id)
1069
1075
->execute();
1070
- }
1076
+ }
1077
+ else {
1078
+ shell_exec('pdf2htmlEX ' . $path_to_pdf . ' --dest-dir ' . file_directory_temp());
1079
+ db_update('os2web_cp_service_documents_conversion')
1080
+ ->fields(array(
1081
+ 'filepath_html' => $path_to_pdf . '.html',
1082
+ 'status' => 'converted',
1083
+ ))
1084
+ ->condition('file_id', $file_id)
1085
+ ->execute();
1086
+ }
1071
1087
}
1072
1088
1073
1089
/**
1074
- * Takes the convent of the html, removed everything except the pure text and then appends this text to case node search metadata field.
1075
- * In the end the temp files (pdf and html) are deleted.
1090
+ * Update field in case node.
1076
1091
*
1077
- * If HTML file is not found, the status will be changed to "downloaded" so that the text cron job will convert the file again.
1078
- * If case node is not found, the status will be chaned to "ERROR: node not found".
1092
+ * Takes the content of the html, removes everything except the pure text
1093
+ * and then appends this text to case node search metadata field.
1094
+ * In the end the temp files (pdf and html) are deleted.
1079
1095
*
1080
- * @param $file_id string id of the document file
1081
- * @param $case_nid the node id the case node, which metadata should be updated
1082
- * @param $path_to_pdf string path to the pdf version of the file
1083
- * @param $path_to_html string path to the html version of the file
1096
+ * If HTML file is not found, the status will be changed to "downloaded"
1097
+ * so that the next cron job will convert the file again.
1098
+ * If case node is not found, the status will be changed to
1099
+ * "ERROR: node not found".
1084
1100
*
1085
- * @return none
1101
+ * @param string $file_id
1102
+ * Id of the document file
1103
+ * @param int $case_nid
1104
+ * The node id of the case node whose metadata should be updated
1105
+ * @param string $path_to_pdf
1106
+ * Path to the pdf version of the file
1107
+ * @param string $path_to_html
1108
+ * Path to the html version of the file
1086
1109
*/
1087
- function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html){
1088
- if (!file_exists($path_to_html)){//if does not exist, send for reconverting. PDF existence will be checked on that step as well.
1110
+ function _os2web_cp_service_update_case_metadata($file_id, $case_nid, $path_to_pdf, $path_to_html) {
1111
+ // If does not exist, send for reconverting.
1112
+ // PDF existence will be checked on that step as well.
1113
+ if (!file_exists($path_to_html)) {
1114
+ db_update('os2web_cp_service_documents_conversion')
1115
+ ->fields(array(
1116
+ 'filepath_html' => NULL,
1117
+ 'status' => 'downloaded',
1118
+ ))
1119
+ ->condition('file_id', $file_id)
1120
+ ->execute();
1121
+ }
1122
+ else {
1123
+ $case_node = node_load($case_nid);
1124
+ if (!$case_node) {
1089
1125
db_update('os2web_cp_service_documents_conversion')
1090
1126
->fields(array(
1091
- 'filepath_html' => null,
1092
- 'status' => 'downloaded',
1127
+ 'status' => 'ERROR: node not found',
1093
1128
))
1094
1129
->condition('file_id', $file_id)
1095
1130
->execute();
1096
- } else {
1097
- $case_node = node_load($case_nid);
1098
- if (!$case_node){
1099
- db_update('os2web_cp_service_documents_conversion')
1100
- ->fields(array(
1101
- 'status' => 'ERROR: node not found',
1102
- ))
1103
- ->condition('file_id', $file_id)
1104
- ->execute();
1105
- } else {
1106
- $text = file_get_contents($path_to_html);
1107
-
1108
- //html tags removing
1109
- $text = str_replace('<p> </p>', ' ', $text); //removing unneeded paragraphs
1110
- $text = preg_replace('#<style(.*?)>(.*?)</style>#is', ' ', $text);//removing style tags
1111
- $text = preg_replace('#<script(.*?)>(.*?)</script>#is', ' ', $text);//removing scripts tags
1112
- $text = str_replace("\r\n", " ", strip_tags($text));
1113
- $text = str_replace("\n\r", " ", $text);
1114
- $text = str_replace("\n", " ", $text);
1115
- $text = str_replace("\r", " ", $text);
1116
- if (isset($case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'])) {
1117
- $search_metadata = $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] . $text;
1118
- }
1119
- else{
1120
- $search_metadata = $text;
1121
- }
1122
- $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] = $search_metadata;
1123
- node_save($case_node);
1124
-
1125
- db_update('os2web_cp_service_documents_conversion')
1126
- ->fields(array(
1127
- 'status' => 'done',
1128
- ))
1129
- ->condition('file_id', $file_id)
1130
- ->execute();
1131
-
1132
- //tmp files cleanup
1133
- unlink($path_to_html);
1134
- unlink($path_to_pdf);
1131
+ }
1132
+ else {
1133
+ $text = file_get_contents($path_to_html);
1134
+
1135
+ // Html tags removing.
1136
+ // Removing unneeded paragraphs.
1137
+ $text = str_replace('<p> </p>', ' ', $text);
1138
+ // Removing style tags.
1139
+ $text = preg_replace('#<style(.*?)>(.*?)</style>#is', ' ', $text);
1140
+ // Removing scripts tags.
1141
+ $text = preg_replace('#<script(.*?)>(.*?)</script>#is', ' ', $text);
1142
+ $text = str_replace("\r\n", " ", strip_tags($text));
1143
+ $text = str_replace("\n\r", " ", $text);
1144
+ $text = str_replace("\n", " ", $text);
1145
+ $text = str_replace("\r", " ", $text);
1146
+ if (isset($case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'])) {
1147
+ $search_metadata = $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] . $text;
1148
+ }
1149
+ else {
1150
+ $search_metadata = $text;
1135
1151
}
1152
+ $case_node->field_os2web_cp_service_searchmt[LANGUAGE_NONE][0]['value'] = $search_metadata;
1153
+ node_save($case_node);
1154
+
1155
+ db_update('os2web_cp_service_documents_conversion')
1156
+ ->fields(array(
1157
+ 'status' => 'done',
1158
+ ))
1159
+ ->condition('file_id', $file_id)
1160
+ ->execute();
1161
+
1162
+ // Tmp files cleanup.
1163
+ unlink($path_to_html);
1164
+ unlink($path_to_pdf);
1136
1165
}
1166
+ }
1137
1167
}
0 commit comments