|
function Get-OpenXML
{
    <#
    .SYNOPSIS
        Gets Office Open XML files (Excel, PowerPoint, and Word)
    .DESCRIPTION
        Gets Office Open XML files (Excel, PowerPoint, and Word) as a structured object.

        The object contains the file path, parts, and relationships of the OpenXML document.

        This cmdlet can be used to read the contents of .docx, .pptx, .xps, .xlsx files
        (or any files that are readable with [`IO.Packaging.Package`](https://learn.microsoft.com/en-us/dotnet/api/system.io.packaging.package?wt.mc_id=MVP_321542))
    .EXAMPLE
        # Get an OpenXML document
        Get-OpenXML -FilePath './Sample.docx'
    .OUTPUTS
        OpenXML.File

        A custom object with `.FilePath`, `.Parts` (an ordered dictionary of `OpenXML.Part`
        objects keyed by part URI) and `.Relationships`.
        Depending on the parts found, the object is additionally decorated as
        `OpenXML.PowerPoint.File`, `OpenXML.Excel.File`, or `OpenXML.Word.File`.
    #>
    [CmdletBinding()]
    [Alias('OpenXML')]
    param(
    # The path to the OpenXML file to read
    [Parameter(ValueFromPipelineByPropertyName=$true)]
    [Alias('Fullname')]
    [string]
    $FilePath
    )

    begin {
        # Helper filter: reads the content of the package part it receives from the pipeline.
        # * Parts whose content type looks like XML are emitted as [xml]
        # * Parts whose URI ends in .json are emitted as parsed JSON
        # * Anything else is emitted as a single [byte[]]
        filter getPartContent {
            $part = $_
            $partStream = $part.GetStream()
            if (-not $partStream) { return }
            try {
                switch ($part.ContentType) {
                    # If the content type looks like XML, read it as XML
                    { $part.ContentType -match '[\./\+]xml' } {
                        $streamReader = [IO.StreamReader]::new($partStream)
                        try {
                            $streamReader.ReadToEnd() -as [xml]
                        } finally {
                            $streamReader.Close()
                        }
                        # Stop here: closing the reader closed the part stream,
                        # so no later condition may try to read it again.
                        break
                    }
                    # If the part looks like JSON, read it as JSON
                    { $part.Uri -match '\.json$'} {
                        $streamReader = [IO.StreamReader]::new($partStream)
                        try {
                            $jsonContent = $streamReader.ReadToEnd()
                        } finally {
                            $streamReader.Close()
                        }
                        $jsonContent | ConvertFrom-Json
                        break
                    }
                    # Otherwise, copy it to a memory stream and return the byte array
                    default {
                        $outputStream = [IO.MemoryStream]::new()
                        try {
                            $partStream.CopyTo($outputStream)
                            # .ToArray() always copies the whole buffer, so no Seek is needed
                            # (and the return value of Seek must not leak into the output).
                            # The unary comma emits the byte[] as one object
                            # instead of unrolling it byte by byte.
                            , $outputStream.ToArray()
                        } finally {
                            $outputStream.Dispose()
                        }
                    }
                }
            } finally {
                # Always release the part stream, even if reading or parsing failed.
                $partStream.Dispose()
            }
        }
    }

    process {
        # Try to resolve the file path
        $resolvedPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($FilePath)
        # If we could not resolve the path, exit
        if (-not $resolvedPath ) { return }

        # Note: PowerShell variable names are case-insensitive,
        # so the loop variable must not be called `$filePath` (that is the parameter).
        foreach ($packagePath in $resolvedPath) {
            # By reading the file with Get-Content -AsByteStream, we avoid locking the file
            # (or the file being locked by another process).
            # The path is already resolved, so it is passed literally
            # (file names containing [ or ] must not be treated as wildcards).
            $packageBytes = Get-Content -LiteralPath $packagePath -AsByteStream -Raw

            # If there were no bytes, there is nothing to read.
            if (-not $packageBytes) { continue }

            # Reset per-file state, so that nothing from a previous file can leak into this one.
            $filePackage   = $null
            $openXmlObject = $null

            # Prepare the progress bar
            # (reading _might_ take a while, for example if you used a lot of PowerPoint images).
            $partCount = 0
            $partProgress = [Ordered]@{Id=Get-Random;Activity='Reading Parts'}

            # Create a memory stream from the byte array
            $memoryStream = [IO.MemoryStream]::new($packageBytes)
            try {
                # and open the package from the memory stream
                $filePackage = [IO.Packaging.Package]::Open($memoryStream, "Open", "Read")
                # If that did not work, move on.
                if (-not $filePackage) { continue }

                # Get the package relationships.
                # (these are important for key corner cases in OpenXML files)
                $packageRelationships = $filePackage.GetRelationships()
                $packageContent = [Ordered]@{}
                $packageParts = @($filePackage.GetParts())
                $partTotal = $packageParts.Length

                # Now read each part in the package, and store it in an `[Ordered]` dictionary
                foreach ($part in $packageParts) {
                    $partCount++
                    # update the progress bar
                    Write-Progress @partProgress -Status "Reading part $($part.Uri) ($partCount of $partTotal)" -PercentComplete (
                        [math]::Round(($partCount * 100/ $partTotal))
                    )
                    # and store the part in the dictionary, keyed by its URI
                    $packageContent["$($part.Uri)"] =
                        [PSCustomObject]@{
                            PSTypeName = 'OpenXML.Part'
                            Uri = $part.Uri
                            ContentType = $part.ContentType
                            # (we'll use our helper filter to get the content)
                            Content = $part | getPartContent
                            FilePath = "$packagePath"
                        }
                }

                # Now we can create the final object.
                $openXmlObject = [PSCustomObject]@{
                    # It is a generic OpenXML file by default
                    PSTypeName = 'OpenXML.File'
                    # with a `.FilePath`, so we can re-read and update it.
                    FilePath = $packagePath
                    # all of the `.Parts` have been read.
                    Parts = $packageContent
                    # and the package relationships are included, too.
                    Relationships = $packageRelationships
                }

                # Now we can get more specific about what type of OpenXML file this is.
                # By looking for certain key parts, we can determine if this is a PowerPoint, Excel, or Word file.
                # If the package contains a part with `/ppt/` in the URI,
                if ($packageContent.Keys -match '/ppt/') {
                    # it is an `OpenXML.PowerPoint.File`
                    $openXmlObject.pstypenames.insert(0, 'OpenXML.PowerPoint.File')
                }

                # If the package contains a part with `/xl/` in the URI,
                if ($packageContent.Keys -match '/xl/') {
                    # it is an `OpenXML.Excel.File`
                    $openXmlObject.pstypenames.insert(0, 'OpenXML.Excel.File')
                }

                # If the package contains a part with `/word/` in the URI,
                if ($packageContent.Keys -match '/word/') {
                    # it is an `OpenXML.Word.File`
                    $openXmlObject.pstypenames.insert(0, 'OpenXML.Word.File')
                }
            } finally {
                # Whether or not reading succeeded, close the package
                if ($filePackage) { $filePackage.Close() }
                # and the memory stream, too,
                $memoryStream.Close()
                # and finally, complete the progress bar.
                Write-Progress @partProgress -Status "Completed reading $partCount parts" -Completed
            }

            # Now we output our openXML object (if we were able to build one)
            if ($openXmlObject) { $openXmlObject }
        }
    }
}
0 commit comments