Skip to content

Commit c52a7e1

Browse files
author
James Brundage
committed
feat: Get-OpenXML ( Fixes #2 )
1 parent 4bbb254 commit c52a7e1

File tree

1 file changed

+161
-0
lines changed

1 file changed

+161
-0
lines changed

Commands/Get-OpenXML.ps1

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
function Get-OpenXML
{
    <#
    .SYNOPSIS
        Gets Open Office XML files (Excel, PowerPoint, and Word)
    .DESCRIPTION
        Gets Open Office XML files (Excel, PowerPoint, and Word) as a structured object.

        The object contains the file path, parts, and relationships of the OpenXML document.

        This cmdlet can be used to read the contents of .docx, .pptx, .xps, .xlsx files
        (or any files that are readable with [`IO.Packaging.Package`](https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging.package?wt.mc_id=MVP_321542))
    .EXAMPLE
        # Get an OpenXML document
        Get-OpenXML -FilePath './Sample.docx'
    .OUTPUTS
        An object whose first type name is 'OpenXML.File' (or, when a part URI contains
        `/ppt/`, `/xl/` or `/word/`, 'OpenXML.PowerPoint.File', 'OpenXML.Excel.File' or
        'OpenXML.Word.File') with these properties:

        * FilePath      - the resolved provider path of the file that was read.
        * Parts         - an ordered dictionary keyed by part URI. Each value is an
                          'OpenXML.Part' object with Uri, ContentType, Content and FilePath.
        * Relationships - the package-level relationships.

        Part content is an [xml] document for XML content types, the result of
        ConvertFrom-Json for parts whose URI ends in `.json`, and a [byte[]] otherwise.
    .NOTES
        The whole file is read into memory before the package is opened,
        so the file on disk is not held open while parts are being read.
    #>
    [CmdletBinding()]
    [Alias('OpenXML')]
    param(
    # The path to the OpenXML file to read
    [Parameter(ValueFromPipelineByPropertyName=$true)]
    [Alias('Fullname')]
    [string]
    $FilePath
    )

    begin {
        # A little helper to get the content of a single package part.
        # Input is a package part from the pipeline; output is exactly one object (or nothing).
        filter getPartContent {
            $part = $_
            $partStream = $part.GetStream()
            if (-not $partStream) { return }

            # The branches are mutually exclusive on purpose:
            # each one consumes the stream, so a part must only ever be read once.
            if ($part.ContentType -match '[\./\+]xml') {
                # If the content type looks like XML, read it as XML
                # (`-as` yields $null rather than an error if the text is not valid XML)
                $streamReader = [IO.StreamReader]::new($partStream)
                $streamReader.ReadToEnd() -as [xml]
                $streamReader.Close()
            }
            elseif ($part.Uri -match '\.json$') {
                # If the part looks like JSON, read it as JSON
                $streamReader = [IO.StreamReader]::new($partStream)
                $jsonContent = $streamReader.ReadToEnd()
                $streamReader.Close()
                $jsonContent | ConvertFrom-Json
            }
            else {
                # Otherwise, copy it into memory and return the raw bytes.
                $outputStream = [IO.MemoryStream]::new()
                $partStream.CopyTo($outputStream)
                # .ToArray() copies the whole buffer regardless of position, so no Seek is needed
                # (an uncaptured .Seek() would also emit its return value as stray output).
                # The unary comma keeps the result a single [byte[]] instead of enumerating it.
                , $outputStream.ToArray()
                $outputStream.Dispose()
            }

            $partStream.Close()
            $partStream.Dispose()
        }
    }

    process {
        # Without a path there is nothing to read.
        # (This also prevents a value left over from a previous pipeline item being reused.)
        if (-not $FilePath) { return }

        # Try to resolve the file path
        $resolvedPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($FilePath)
        # If we could not resolve the path, exit
        if (-not $resolvedPath) { return }

        # The loop variable is deliberately not called `$filePath`:
        # PowerShell variables are case-insensitive, so that name would overwrite the parameter.
        foreach ($packagePath in $resolvedPath) {
            # By reading the file with Get-Content -AsByteStream, we avoid locking the file
            # (or the file being locked by another process)
            # NOTE(review): -Path treats `[` and `]` as wildcards; -LiteralPath may be preferable - confirm.
            $packageBytes = Get-Content -Path $packagePath -AsByteStream -Raw

            # If there were no bytes, move on
            if (-not $packageBytes) { continue }

            # Create a memory stream from the byte array
            $memoryStream = [IO.MemoryStream]::new($packageBytes)
            # and open the package from the memory stream
            $filePackage = [IO.Packaging.Package]::Open($memoryStream, "Open", "Read")
            # If that did not work, release the stream and move on.
            if (-not $filePackage) {
                $memoryStream.Close()
                continue
            }

            # Get the package relationships.
            # (these are important for key corner cases in OpenXML files)
            $packageRelationships = $filePackage.GetRelationships()
            $packageContent = [Ordered]@{}
            $packageParts = @($filePackage.GetParts())

            # Now we will read each part in the package, and store it in an `[Ordered]` dictionary
            # Since this _might_ take a while (if you used a lot of PowerPoint images) we want to show a progress bar.

            # Prepare the progress bar
            $partCount = 0
            $partTotal = $packageParts.Length
            $partProgress = [Ordered]@{Id=Get-Random;Activity='Reading Parts'}

            # Then read each part
            foreach ($part in $packageParts) {
                $partCount++
                # update the progress bar
                Write-Progress @partProgress -Status "Reading part $($part.Uri) ($partCount of $partTotal)" -PercentComplete (
                    [math]::Round(($partCount * 100/ $partTotal))
                )
                # and store the part in the dictionary
                $packageContent["$($part.Uri)"] =
                    [PSCustomObject]@{
                        PSTypeName = 'OpenXML.Part'
                        Uri = $part.Uri
                        ContentType = $part.ContentType
                        # (we'll use our helper function to get the content)
                        Content = $part | getPartContent
                        FilePath = "$packagePath"
                    }
            }

            # Now that we've read all parts, we can close the package
            $filePackage.Close()
            # and the memory stream, too.
            $memoryStream.Close()

            # and finally, complete the progress bar.
            Write-Progress @partProgress -Status "Completed reading $partCount parts" -Completed

            # Now we can create the final object.
            $openXmlObject = [PSCustomObject]@{
                # It is a generic OpenXML file by default
                PSTypeName = 'OpenXML.File'
                # with a `.FilePath`, so we can re-read and update it.
                FilePath = $packagePath
                # all of the `.Parts` have been read.
                Parts = $packageContent
                # and the package relationships are included, too.
                Relationships = $packageRelationships
            }

            # Now we can get more specific about what type of OpenXML file this is.
            # By looking for certain key parts, we can determine if this is a PowerPoint, Excel, or Word file.
            # (`-match` against the key collection returns the matching keys, so any match is truthy)

            # If the package contains a part with `/ppt/` in the URI,
            if ($packageContent.Keys -match '/ppt/') {
                # it is an `OpenXML.PowerPoint.File`
                $openXmlObject.pstypenames.insert(0, 'OpenXML.PowerPoint.File')
            }

            # If the package contains a part with `/xl/` in the URI,
            if ($packageContent.Keys -match '/xl/') {
                # it is an `OpenXML.Excel.File`
                $openXmlObject.pstypenames.insert(0, 'OpenXML.Excel.File')
            }

            # If the package contains a part with `/word/` in the URI,
            if ($packageContent.Keys -match '/word/') {
                # it is an `OpenXML.Word.File`
                $openXmlObject.pstypenames.insert(0, 'OpenXML.Word.File')
            }

            # Now we output our openXML object
            $openXmlObject
        }
    }
}

0 commit comments

Comments
 (0)