Skip to content

Commit 400cb9a

Browse files
committed
chore(xml): get detailed element information with getParagraphs / getParagraphGroups (WIP)
1 parent bcb5277 commit 400cb9a

File tree

2 files changed

+208
-11
lines changed

2 files changed

+208
-11
lines changed

src/helper/xml-slide-helper.ts

Lines changed: 183 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
import {
22
ElementInfo,
33
ElementType,
4+
TextParagraph,
5+
TextParagraphGroup,
46
XmlDocument,
57
XmlElement,
68
} from '../types/xml-types';
79
import { XmlHelper } from './xml-helper';
810
import HasShapes from '../classes/has-shapes';
9-
import { FindElementSelector, ShapeModificationCallback } from '../types/types';
10-
import ModifyTableHelper from './modify-table-helper';
11-
import { TableData, TableInfo } from '../types/table-types';
11+
import { TableInfo } from '../types/table-types';
1212

1313
/** Namespace URI passed to namespace-aware element lookups in this helper. */
export const nsMain =
  'http://schemas.openxmlformats.org/presentationml/2006/main';

/**
 * Lookup table from a schema / relationship URI to the short type name
 * this helper reports for an element carrying that URI.
 */
export const mapUriType = {
  'http://schemas.openxmlformats.org/drawingml/2006/table': 'table',
  'http://schemas.openxmlformats.org/drawingml/2006/chart': 'chart',
  'http://schemas.microsoft.com/office/drawing/2014/chartex': 'chartEx',
  'http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject':
    'oleObject',
  'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink':
    'hyperlink',
};
2224

2325
/**
@@ -118,6 +120,9 @@ export class XmlSlideHelper {
118120
hasTextBody: !!XmlSlideHelper.getTextBody(slideElement),
119121
getXmlElement: () => slideElement,
120122
getText: () => XmlSlideHelper.parseTextFragments(slideElement),
123+
getParagraphs: () => XmlSlideHelper.parseTextParagraphs(slideElement),
124+
getParagraphGroups: () =>
125+
XmlSlideHelper.parseParagraphGroups(slideElement),
121126
getTableInfo: () => XmlSlideHelper.readTableInfo(slideElement),
122127
getAltText: () => XmlSlideHelper.getImageAltText(slideElement),
123128
};
@@ -152,13 +157,170 @@ export class XmlSlideHelper {
152157
/**
 * Collects the text content of every `a:t` element found in the text body
 * of the given shape.
 *
 * @param shapeNode - Shape element whose text body is read.
 * @returns One entry per `a:t` element; an empty array when the shape has
 *   no text body.
 */
static parseTextFragments(shapeNode: XmlElement): string[] {
  const txBody = XmlSlideHelper.getTextBody(shapeNode);
  if (!txBody) {
    return [];
  }

  return Array.from(
    txBody.getElementsByTagName('a:t'),
    (textNode) => textNode.textContent,
  );
}
161172

173+
/**
 * Parses the paragraphs of a shape and merges consecutive paragraphs with
 * matching properties into groups.
 *
 * @param shapeNode - Shape element to read paragraphs from.
 * @returns The grouped paragraphs produced by `groupSimilarParagraphs`.
 */
static parseParagraphGroups(shapeNode: XmlElement): TextParagraphGroup[] {
  return XmlSlideHelper.groupSimilarParagraphs(
    XmlSlideHelper.parseTextParagraphs(shapeNode),
  );
}
177+
178+
/**
 * Reads every `a:p` paragraph of a shape's text body together with the
 * paragraph and run properties this helper understands.
 *
 * Paragraphs that yield no `a:t` text elements are omitted from the result.
 *
 * @param shapeNode - Shape element to inspect.
 * @returns The parsed paragraphs; empty when no `p:txBody` / `a:txBody`
 *   element is found.
 */
static parseTextParagraphs(shapeNode: XmlElement): TextParagraph[] {
  // Prefer the presentation text body, fall back to the drawing one.
  const txBody =
    shapeNode.getElementsByTagName('p:txBody')[0] ||
    shapeNode.getElementsByTagName('a:txBody')[0];
  if (!txBody) {
    return [];
  }

  const result: TextParagraph[] = [];

  Array.from(txBody.getElementsByTagName('a:p')).forEach((paragraphNode) => {
    const paragraph: TextParagraph = { texts: [] };

    // Paragraph-level settings (margin, bullet, numbering, alignment).
    const paragraphProps = paragraphNode.getElementsByTagName('a:pPr')[0];
    if (paragraphProps) {
      XmlSlideHelper.setParagraphProperties(paragraphProps, paragraph);
    }

    // Each run contributes its formatting and its text pieces.
    Array.from(paragraphNode.getElementsByTagName('a:r')).forEach((runNode) => {
      XmlSlideHelper.setTextProperties(runNode, paragraph);

      Array.from(runNode.getElementsByTagName('a:t')).forEach((textNode) => {
        paragraph.texts.push(textNode.textContent || '');
      });
    });

    // Skip paragraphs without any text element.
    if (paragraph.texts.length > 0) {
      result.push(paragraph);
    }
  });

  return result;
}
224+
225+
/**
 * Copies formatting found on a text run's `a:rPr` element onto the
 * paragraph it belongs to.
 *
 * Flags are only ever switched on, so a paragraph is marked bold / italic /
 * underlined as soon as one of its runs is. A non-zero font size overwrites
 * any size set by a previous run.
 *
 * @param run - The `a:r` element to read run properties from.
 * @param paragraph - Paragraph object that is mutated in place.
 */
static setTextProperties(run: XmlElement, paragraph: TextParagraph) {
  const rPr = run.getElementsByTagName('a:rPr')[0];
  if (!rPr) {
    return;
  }

  // `b` and `i` are XML booleans, so both spellings of "true" are accepted.
  const isOn = (value: string | null): boolean =>
    value === '1' || value === 'true';

  // `u` is not a boolean: it names an underline style ('sng', 'dbl', ...),
  // where 'none' means explicitly not underlined. '0' / 'false' are treated
  // as "off" as well to stay lenient with non-conforming input.
  const underline = rPr.getAttribute('u');
  const isUnderlined =
    !!underline &&
    underline !== 'none' &&
    underline !== '0' &&
    underline !== 'false';

  // `sz` is divided by 100 to convert to points; a missing or unparsable
  // value ends up falsy (0 or NaN) and is skipped below.
  const fontSize = parseInt(rPr.getAttribute('sz') || '0', 10) / 100;

  if (isOn(rPr.getAttribute('b'))) paragraph.isBold = true;
  if (isOn(rPr.getAttribute('i'))) paragraph.isItalic = true;
  if (isUnderlined) paragraph.isUnderlined = true;
  if (fontSize) paragraph.fontSize = fontSize;
}
239+
240+
/**
 * Copies the paragraph-level settings found on an `a:pPr` element onto the
 * given paragraph object.
 *
 * Only values that are actually present are written, so absent attributes
 * leave the corresponding paragraph fields `undefined`.
 *
 * @param pPr - The `a:pPr` element of the paragraph.
 * @param paragraph - Paragraph object that is mutated in place.
 */
static setParagraphProperties(pPr: XmlElement, paragraph: TextParagraph) {
  // `marL` is stored as `indent`; an absent or unparsable value is ignored
  // instead of being stored as NaN.
  const marL = parseInt(pPr.getAttribute('marL') || '', 10);
  if (!Number.isNaN(marL)) {
    paragraph.indent = marL;
  }

  // Only record a bullet when a character is actually given. Assigning a
  // null / empty result would count as a defined property for callers that
  // test against `undefined`.
  const buChar = pPr.getElementsByTagName('a:buChar')[0];
  const bulletChar = buChar ? buChar.getAttribute('char') : null;
  if (bulletChar) {
    paragraph.bullet = bulletChar;
  }

  // Check for numbered list
  const buAutoNum = pPr.getElementsByTagName('a:buAutoNum')[0];
  if (buAutoNum) {
    paragraph.isNumbered = true;
    paragraph.numberingType = buAutoNum.getAttribute('type') || undefined;
    paragraph.startAt = buAutoNum.getAttribute('startAt') || undefined;
  }

  // Check for alignment
  const algn = pPr.getAttribute('algn');
  if (algn) {
    // NOTE(review): unchecked cast; the attribute value is taken as-is.
    paragraph.align = algn as TextParagraph['align'];
  }
}
265+
266+
/**
 * Merges consecutive paragraphs that share the same set of defined
 * formatting properties into groups.
 *
 * Two paragraphs belong to the same group when the JSON serialisation of
 * their defined properties is equal. `indent` is currently not part of the
 * comparison.
 *
 * @param paragraphs - Paragraphs in their original order.
 * @returns Groups in order of appearance; each group holds its properties
 *   and one joined text string per paragraph.
 */
static groupSimilarParagraphs(
  paragraphs: TextParagraph[],
): TextParagraphGroup[] {
  // Properties that take part in the comparison, in serialisation order.
  const groupingKeys = [
    'fontSize',
    'isBold',
    'isItalic',
    'isUnderlined',
    // 'indent',
    'align',
    'isNumbered',
    'numberingType',
    'bullet',
    'startAt',
  ] as const;

  // Extract only those grouping properties that are set on the paragraph.
  const pickDefined = (paragraph: TextParagraph): Record<string, any> =>
    groupingKeys.reduce((picked: Record<string, any>, key) => {
      if (paragraph[key] !== undefined) {
        picked[key] = paragraph[key];
      }
      return picked;
    }, {});

  const sameProperties = (a: any, b: any): boolean =>
    JSON.stringify(a) === JSON.stringify(b);

  const result: TextParagraphGroup[] = [];
  let active: TextParagraphGroup | null = null;

  for (const paragraph of paragraphs) {
    const properties = pickDefined(paragraph);

    // Open a new group at the start and whenever the formatting changes.
    if (active === null || !sameProperties(active.properties, properties)) {
      active = { properties, texts: [] };
      result.push(active);
    }

    active.texts.push(paragraph.texts.join(''));
  }

  return result;
}
323+
162324
/**
 * Looks up the first `cNvPr` element in the `nsMain` namespace below the
 * given shape.
 *
 * @param shapeNode - Shape element to search in.
 * @returns The first matching element as returned by `item(0)`.
 */
static getNonVisibleProperties(shapeNode: XmlElement): XmlElement {
  const matches = shapeNode.getElementsByTagNameNS(nsMain, 'cNvPr');
  return matches.item(0);
}
@@ -197,20 +359,24 @@ export class XmlSlideHelper {
197359
static getElementType(slideElementParent: XmlElement): ElementType {
198360
let type = slideElementParent.localName;
199361

362+
const getUri = () => {
363+
const graphicData =
364+
slideElementParent.getElementsByTagName('a:graphicData')[0];
365+
return graphicData.getAttribute('uri');
366+
};
367+
200368
switch (type) {
201369
case 'graphicFrame':
202-
const graphicData =
203-
slideElementParent.getElementsByTagName('a:graphicData')[0];
204-
const uri = graphicData.getAttribute('uri');
205-
type = mapUriType[uri] ? mapUriType[uri] : type;
370+
type = mapUriType[getUri()] || type;
206371
break;
207372
case 'oleObj':
208373
type = 'OLEObject';
209374
break;
210375
}
211376

212377
// Check for hyperlinks
213-
const hasHyperlink = slideElementParent.getElementsByTagName('a:hlinkClick');
378+
const hasHyperlink =
379+
slideElementParent.getElementsByTagName('a:hlinkClick');
214380
if (hasHyperlink.length > 0) {
215381
type = 'Hyperlink';
216382
}
@@ -228,13 +394,15 @@ export class XmlSlideHelper {
228394
y: 0,
229395
cx: 0,
230396
cy: 0,
397+
rot: 0,
231398
};
232399

233400
if (!xFrms.item(0)) {
234401
return position;
235402
}
236403

237404
const xFrm = xFrms.item(0);
405+
238406
const Off = xFrm.getElementsByTagName('a:off').item(0);
239407
const Ext = xFrm.getElementsByTagName('a:ext').item(0);
240408

@@ -243,6 +411,10 @@ export class XmlSlideHelper {
243411
position.cx = XmlSlideHelper.parseCoordinate(Ext, 'cx');
244412
position.cy = XmlSlideHelper.parseCoordinate(Ext, 'cy');
245413

414+
if(xFrm.getAttribute('rot')) {
415+
position.rot = parseInt(xFrm.getAttribute('rot'));
416+
}
417+
246418
return position;
247419
}
248420

src/types/xml-types.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,39 @@ export type ElementInfo = {
7272
y: number;
7373
cx: number;
7474
cy: number;
75+
rot?: number;
7576
};
7677
hasTextBody: boolean;
7778
getText: () => string[];
79+
getParagraphs: () => TextParagraph[];
80+
getParagraphGroups: () => TextParagraphGroup[];
7881
getAltText: () => string;
7982
getTableInfo: () => TableInfo[];
8083
getXmlElement: () => XmlElement;
8184
};
8285

86+
/**
 * Formatting properties that can be read from a text paragraph.
 * Every field is optional; a field is left `undefined` when the
 * corresponding setting was not found.
 */
export type TextParagraphProps = {
  isBold?: boolean;
  isItalic?: boolean;
  isUnderlined?: boolean;
  /** Font size in points. */
  fontSize?: number;
  /**
   * Raw value of the paragraph's `algn` attribute. Covers all alignment
   * values of the DrawingML text alignment type, not only left / center /
   * right.
   */
  align?: 'l' | 'ctr' | 'r' | 'just' | 'justLow' | 'dist' | 'thaiDist';
  indent?: number;
  /** Bullet character of a bulleted paragraph. */
  bullet?: string;
  isNumbered?: boolean;
  numberingType?: string;
  startAt?: string;
};

/** A single paragraph: its text pieces plus its formatting properties. */
export type TextParagraph = {
  texts: string[];
} & TextParagraphProps;

/** Paragraphs that share the same formatting properties. */
export type TextParagraphGroup = {
  properties: TextParagraphProps;
  texts: string[];
};
107+
83108
export type ContentMapType = 'slideMaster' | 'slideLayout';
84109
export type ContentMap = {
85110
type: ContentMapType;

0 commit comments

Comments
 (0)