Skip to content

Commit 6d0667d

Browse files
committed
#90 - Extract title and notes from PPTX files into a json meta data file
1 parent 01e13a5 commit 6d0667d

File tree

3 files changed

+125
-0
lines changed

3 files changed

+125
-0
lines changed

composer.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"wittiws/htmlawed":"dev-master",
2323
"wittiws/phpquery":"dev-master",
2424
"wittiws/quipxml":"dev-master",
25+
"wittiws/splash":"dev-master",
2526
"zendframework/zend-mail":"2.7.*"
2627
},
2728
"autoload":{

src/Configuration/ConfigurationDefaults.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,11 @@ public function __construct(&$settings) {
146146
'#engine' => 'Convert\\Unoconv',
147147
),
148148
),
149+
'pptx->json' => array(
150+
'nativemeta:default' => array(
151+
'#engine' => 'Convert\\NativeMeta',
152+
),
153+
),
149154
'rtf->pdf' => array(
150155
'unoconv:default' => array(
151156
'#engine' => 'Convert\\Unoconv',

src/Engine/Convert/NativeMeta.php

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
<?php
2+
/*
3+
* This file is part of the FileConverter package.
4+
*
5+
* (c) Greg Payne
6+
*
7+
* For the full copyright and license information, please view the LICENSE
8+
* file that was distributed with this source code.
9+
*/
10+
11+
namespace FileConverter\Engine\Convert;
12+
use FileConverter\Engine\EngineBase;
13+
use Splash\Splash;
14+
use QuipXml\Quip;
15+
16+
class NativeMeta extends EngineBase {
  /**
   * Relationship type that links a slide part to its speaker-notes part.
   */
  const REL_TYPE_NOTES_SLIDE = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

  /**
   * Extract meta data from the source file and write it to the destination.
   *
   * The source and destination formats are read from $this->conversion
   * (index 0 is the source type, index 1 is the destination type). Only
   * pptx -> json is implemented here.
   *
   * @param string $source
   *   Path to the file to read.
   * @param string $destination
   *   Path of the file to write.
   *
   * @return $this
   *
   * @throws \InvalidArgumentException
   *   When the source or destination type is not supported.
   * @throws \ErrorException
   *   When the source cannot be opened or the destination cannot be written.
   */
  public function convertFile($source, $destination) {
    // Extract meta data based on the file type.
    switch ($this->conversion[0]) {
      case 'pptx':
        $meta = $this->extractPptxMeta($source);
        break;

      default:
        throw new \InvalidArgumentException("Unsupported conversion source type requested");
    }

    switch ($this->conversion[1]) {
      case 'json':
        // JSON_PARTIAL_OUTPUT_ON_ERROR keeps the export best-effort when a
        // single value cannot be encoded (e.g. invalid UTF-8 in a note).
        $output = json_encode($meta, JSON_PRETTY_PRINT
          | JSON_PARTIAL_OUTPUT_ON_ERROR);
        if ($output === FALSE) {
          throw new \ErrorException("Unable to encode the meta data as JSON");
        }
        // Do not report success when nothing was written.
        if (file_put_contents($destination, $output) === FALSE) {
          throw new \ErrorException("Unable to write the meta data file");
        }
        return $this;

      default:
        throw new \InvalidArgumentException("Unsupported conversion destination type requested");
    }
  }

  /**
   * Read the title and notes of every slide in a PPTX archive.
   *
   * @param string $source
   *   Path to the PPTX file.
   *
   * @return array
   *   array('slides' => array(...)) where each slide has 'number', 'title'
   *   and, when a notes part is linked, 'notes'. Slides are keyed by
   *   (number - 1) and sorted by that key.
   *
   * @throws \ErrorException
   *   When the archive cannot be opened.
   */
  protected function extractPptxMeta($source) {
    // Open the pptx file
    $pptx = new \ZipArchive;
    if (TRUE !== $pptx->open($source)) {
      throw new \ErrorException("Unable to open the PPTX file");
    }

    try {
      // Build the list of files.
      $names = array();
      for ($i = 0; $i < $pptx->numFiles; $i++) {
        $names[] = $pptx->getNameIndex($i);
      }
      $files = Splash::fromArray($names);

      // Build the slides. The pattern is anchored and the dot escaped so the
      // filter accepts exactly the names the number extraction below
      // understands; otherwise a nested path would leave the whole file name
      // in $number.
      $slides = array();
      foreach ($files->regex('@^ppt/slides/slide\d+\.xml$@') as $file) {
        $number = preg_replace('@^ppt/slides/slide(\d+)\.xml$@s', '\1', $file);
        $slide = array(
          'number' => $number,
          // ppt/slides/slide1.xml
          'title' => $this->extractSlideTitle(Quip::load($pptx->getFromName($file))),
        );

        // The 'notes' key is only present when the slide links a notes part.
        $notes = $this->extractSlideNotes($pptx, $number);
        if ($notes !== NULL) {
          $slide['notes'] = $notes;
        }

        $slides[(int) $number - 1] = $slide;
      }
      ksort($slides);

      return array('slides' => $slides);
    }
    finally {
      // Release the archive handle on success and on any parsing failure.
      $pptx->close();
    }
  }

  /**
   * Detect the title text of a slide.
   *
   * @param mixed $xml_slide
   *   The slide XML as returned by Quip::load().
   *
   * @return string
   *   The plain-text title, with line breaks for a:br tags; empty when the
   *   slide has no title placeholder.
   */
  protected function extractSlideTitle($xml_slide) {
    // Find the title placeholder, then step up to the shape that holds it.
    $title = $xml_slide
      ->xpath("//p:sp//p:ph[@type='title' or @type='ctrTitle']")
      ->xpath('../../..')
      ->html();
    // http://www.datypic.com/sc/ooxml/e-a_br-1.html
    $title = preg_replace('@<a:br[^>]*>@s', "\n", $title);
    return trim(strip_tags($title));
  }

  /**
   * Detect the speaker notes attached to a slide.
   *
   * @param \ZipArchive $pptx
   *   The opened PPTX archive.
   * @param string $number
   *   The slide number as it appears in the slide file name.
   *
   * @return string|null
   *   The plain-text notes, or NULL when the slide has no readable notes part.
   */
  protected function extractSlideNotes(\ZipArchive $pptx, $number) {
    // ppt/slides/_rels/slide1.xml.rels
    $rels = $pptx->getFromName("ppt/slides/_rels/slide$number.xml.rels");
    if ($rels === FALSE) {
      return NULL;
    }

    // The 'Relationship' tag name does not work in this xpath, most likely
    // because the .rels part declares a default namespace and an unprefixed
    // XPath 1.0 name test only matches elements in no namespace. Matching on
    // the Type attribute with a wildcard avoids the problem.
    $xml_rels = Quip::load($rels);
    $target = (string) $xml_rels
      ->xpath("//*[@Type='" . self::REL_TYPE_NOTES_SLIDE . "']")
      ->eq(0)['Target'];

    // Load the notes from the connected XML.
    // ../notesSlides/notesSlide1.xml
    // becomes ppt/notesSlides/notesSlide1.xml
    if (substr($target, 0, 3) !== '../') {
      return NULL;
    }
    $path = preg_replace('@^\.\./@', 'ppt/', $target);
    $raw = $pptx->getFromName($path);
    if ($raw === FALSE) {
      // The relationship points at a part that is not in the archive.
      return NULL;
    }

    $note = Quip::load($raw)->html();
    // The a:p tag appears preferred within notes.
    $note = preg_replace('@<a:br[^>]*>@s', "\n", $note);
    $note = preg_replace('@<a:p(?:\s[^>]*)?>@s', "\n", $note);
    // The slide number appears in notes within a:fld.
    $note = preg_replace('@<a:fld.*?</a:fld>@s', "", $note);
    return trim(strip_tags($note));
  }

  /**
   * Describe how to install this engine on the given operating system.
   *
   * @param string $os
   *   Operating system name; only 'Ubuntu' is handled here.
   * @param string $os_version
   *   Operating system version, passed through to the parent implementation.
   *
   * @return mixed
   *   The help array for Ubuntu, otherwise whatever the parent returns.
   */
  protected function getHelpInstallation($os, $os_version) {
    $help = array(
      'title' => 'Native Meta Data Extractor',
    );
    switch ($os) {
      case 'Ubuntu':
        $help['os'] = 'confirmed on Ubuntu 16.04';
        $help['notes'] = array(
          'composer update',
        );
        return $help;
    }

    return parent::getHelpInstallation($os, $os_version);
  }

  /**
   * Report this engine as available.
   *
   * @return bool
   *   Always TRUE; no external command is probed.
   */
  public function isAvailable() {
    return TRUE;
  }

}

0 commit comments

Comments
 (0)