|
1 | | -<# |
2 | | -.SYNOPSIS |
3 | | -Classify text files by encoding under the current subtree, respecting .gitignore. |
4 | | -
|
5 | | -.DESCRIPTION |
6 | | -Enumerates tracked files and untracked-but-not-ignored files (via Git) beneath |
7 | | -PWD. Skips likely-binary files (NUL probe). Classifies remaining files as: |
8 | | - - 'utf8' : valid UTF-8 (no BOM) or empty file |
9 | | - - 'utf8-with-bom' : starts with UTF-8 BOM (EF BB BF) |
10 | | - - 'other' : text but not valid UTF-8 (e.g., UTF-16/ANSI) |
11 | | -
|
12 | | -Outputs: |
13 | | - 1) Relative paths of files classified as 'other' |
14 | | - 2) A table by extension: UTF8 / UTF8-with-BOM / Other / Total |
15 | | -
|
16 | | -Notes: |
17 | | - - Read-only: this script makes no changes. |
18 | | - - Requires Git and must be run inside a Git work tree. |
19 | | -#> |
20 | | - |
21 | | -[CmdletBinding()] |
22 | | -param() |
23 | | - |
24 | | -Set-StrictMode -Version Latest |
25 | | -$ErrorActionPreference = 'Stop' |
26 | | - |
27 | | -# --- Git enumeration --------------------------------------------------------- |
28 | | -function Assert-InGitWorkTree { |
29 | | - # Throws if not inside a Git work tree. |
30 | | - $inside = (& git rev-parse --is-inside-work-tree 2>$null).Trim() |
31 | | - if ($LASTEXITCODE -ne 0 -or $inside -ne 'true') { |
32 | | - throw 'Not in a Git work tree.' |
33 | | - } |
34 | | -} |
35 | | - |
36 | | -function Get-GitFilesUnderPwd { |
37 | | - <# |
38 | | - Returns full paths to tracked + untracked-not-ignored files under PWD. |
39 | | - #> |
40 | | - Assert-InGitWorkTree |
41 | | - |
42 | | - $repoRoot = (& git rev-parse --show-toplevel).Trim() |
43 | | - $pwdPath = (Get-Location).Path |
44 | | - |
45 | | - # cached (tracked) + others (untracked not ignored) |
46 | | - $nulSeparated = & git -C $repoRoot ls-files -z --cached --others --exclude-standard |
47 | | - |
48 | | - $relativePaths = $nulSeparated.Split( |
49 | | - [char]0, [System.StringSplitOptions]::RemoveEmptyEntries) |
50 | | - |
51 | | - foreach ($relPath in $relativePaths) { |
52 | | - $fullPath = Join-Path $repoRoot $relPath |
53 | | - |
54 | | - # Only include files under the current subtree. |
55 | | - if ($fullPath.StartsWith($pwdPath, |
56 | | - [System.StringComparison]::OrdinalIgnoreCase)) { |
57 | | - if (Test-Path -LiteralPath $fullPath -PathType Leaf) { $fullPath } |
58 | | - } |
59 | | - } |
60 | | -} |
61 | | - |
62 | | -# --- Probes ------------------------------------------------------------------ |
63 | | -function Test-ProbablyBinary { |
64 | | - # Heuristic: treat as binary if the first 8 KiB contains any NUL byte. |
65 | | - param([Parameter(Mandatory)][string]$Path) |
66 | | - |
67 | | - try { |
68 | | - $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite') |
69 | | - try { |
70 | | - $len = [int][Math]::Min(8192,$stream.Length) |
71 | | - if ($len -le 0) { return $false } |
72 | | - |
73 | | - $buffer = [byte[]]::new($len) |
74 | | - [void]$stream.Read($buffer,0,$len) |
75 | | - return ($buffer -contains 0) |
76 | | - } |
77 | | - finally { $stream.Dispose() } |
78 | | - } |
79 | | - catch { return $false } |
80 | | -} |
81 | | - |
82 | | -function Get-TextEncodingCategory { |
83 | | - # Returns 'utf8', 'utf8-with-bom', 'other', or $null for likely-binary. |
84 | | - param([Parameter(Mandatory)][string]$Path) |
85 | | - |
86 | | - $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite') |
87 | | - try { |
88 | | - $fileLength = $stream.Length |
89 | | - if ($fileLength -eq 0) { return 'utf8' } |
90 | | - |
91 | | - # BOM check (EF BB BF) |
92 | | - $header = [byte[]]::new([Math]::Min(3,$fileLength)) |
93 | | - [void]$stream.Read($header,0,$header.Length) |
94 | | - if ($header.Length -ge 3 -and |
95 | | - $header[0] -eq 0xEF -and $header[1] -eq 0xBB -and $header[2] -eq 0xBF) { |
96 | | - return 'utf8-with-bom' |
97 | | - } |
98 | | - |
99 | | - # Quick binary probe before expensive decoding |
100 | | - $stream.Position = 0 |
101 | | - $sampleLen = [int][Math]::Min(8192,$fileLength) |
102 | | - $sample = [byte[]]::new($sampleLen) |
103 | | - [void]$stream.Read($sample,0,$sampleLen) |
104 | | - if ($sample -contains 0) { return $null } |
105 | | - } |
106 | | - finally { $stream.Dispose() } |
107 | | - |
108 | | - # Validate UTF-8 by decoding with throw-on-invalid option (no BOM). |
109 | | - try { |
110 | | - $bytes = [System.IO.File]::ReadAllBytes($Path) |
111 | | - $utf8 = [System.Text.UTF8Encoding]::new($false,$true) |
112 | | - [void]$utf8.GetString($bytes) |
113 | | - return 'utf8' |
114 | | - } |
115 | | - catch { return 'other' } |
116 | | -} |
117 | | - |
118 | | -# --- Main -------------------------------------------------------------------- |
119 | | -$otherFiles = @() |
120 | | -$byExtension = @{} |
121 | | - |
122 | | -$allFiles = Get-GitFilesUnderPwd |
123 | | - |
124 | | -foreach ($fullPath in $allFiles) { |
125 | | - # Avoid decoding likely-binary files. |
126 | | - if (Test-ProbablyBinary $fullPath) { continue } |
127 | | - |
128 | | - $category = Get-TextEncodingCategory $fullPath |
129 | | - if (-not $category) { continue } |
130 | | - |
131 | | - $ext = [IO.Path]::GetExtension($fullPath).ToLower() |
132 | | - if (-not $byExtension.ContainsKey($ext)) { |
133 | | - $byExtension[$ext] = @{ 'utf8' = 0; 'utf8-with-bom' = 0; 'other' = 0 } |
134 | | - } |
135 | | - |
136 | | - $byExtension[$ext][$category]++ |
137 | | - |
138 | | - if ($category -eq 'other') { |
139 | | - $otherFiles += (Resolve-Path -LiteralPath $fullPath -Relative) |
140 | | - } |
141 | | -} |
142 | | - |
143 | | -# 1) Files in 'other' |
144 | | -if ($otherFiles.Count -gt 0) { |
145 | | - 'Files classified as ''other'':' |
146 | | - $otherFiles | Sort-Object | ForEach-Object { " $_" } |
147 | | - '' |
148 | | -} |
149 | | - |
150 | | -# 2) Table by extension |
151 | | -$rows = foreach ($kv in $byExtension.GetEnumerator()) { |
152 | | - $ext = if ($kv.Key) { $kv.Key } else { '[noext]' } |
153 | | - $u = [int]$kv.Value['utf8'] |
154 | | - $b = [int]$kv.Value['utf8-with-bom'] |
155 | | - $o = [int]$kv.Value['other'] |
156 | | - |
157 | | - [PSCustomObject]@{ |
158 | | - Extension = $ext |
159 | | - UTF8 = $u |
160 | | - 'UTF8-with-BOM' = $b |
161 | | - Other = $o |
162 | | - Total = $u + $b + $o |
163 | | - } |
164 | | -} |
165 | | - |
166 | | -$rows | |
167 | | - Sort-Object -Property ( |
168 | | - @{Expression='Total';Descending=$true}, |
169 | | - @{Expression='Extension';Descending=$false} |
170 | | - ) | |
171 | | - Format-Table -AutoSize |
| 1 | +<# |
| 2 | +.SYNOPSIS |
| 3 | +Classify text files by encoding under the current subtree, respecting .gitignore. |
| 4 | +
|
| 5 | +.DESCRIPTION |
| 6 | +Enumerates tracked files and untracked-but-not-ignored files (via Git) beneath |
| 7 | +PWD. Skips likely-binary files (NUL probe). Classifies remaining files as: |
| 8 | + - 'utf8' : valid UTF-8 (no BOM) or empty file |
| 9 | + - 'utf8-with-bom' : starts with UTF-8 BOM (EF BB BF) |
| 10 | + - 'other' : text but not valid UTF-8 (e.g., ANSI code pages); UTF-16/UTF-32 files normally contain NUL bytes, so the NUL probe skips them as likely-binary
| 11 | +
|
| 12 | +Outputs: |
| 13 | + 1) Relative paths of files classified as 'other' |
| 14 | + 2) A table by extension: UTF8 / UTF8-with-BOM / Other / Total |
| 15 | +
|
| 16 | +Notes: |
| 17 | + - Read-only: this script makes no changes. |
| 18 | + - Requires Git and must be run inside a Git work tree. |
| 19 | +#> |
| 20 | + |
| 21 | +[CmdletBinding()] |
| 22 | +param() |
| 23 | + |
| 24 | +Set-StrictMode -Version Latest |
| 25 | +$ErrorActionPreference = 'Stop' |
| 26 | + |
| 27 | +# --- Git enumeration --------------------------------------------------------- |
function Assert-InGitWorkTree {
    <#
    Throws 'Not in a Git work tree.' unless the current directory is inside a
    Git work tree. Takes no parameters and returns nothing on success.

    If the git executable itself cannot be found, the resulting
    CommandNotFoundException is surfaced unchanged so the real cause is visible.
    #>
    $stdout = $null
    $succeeded = $false

    try {
        # Git's own diagnostics on stderr are discarded; only stdout and the
        # exit code are used for the decision.
        $stdout = & git rev-parse --is-inside-work-tree 2>$null
        $succeeded = ($LASTEXITCODE -eq 0)
    }
    catch [System.Management.Automation.CommandNotFoundException] {
        throw
    }
    catch {
        # With $ErrorActionPreference = 'Stop', redirected stderr from a native
        # command can surface as a terminating error on some PowerShell
        # versions. Treat that the same as a non-zero exit code.
        $succeeded = $false
    }

    # On failure git prints nothing to stdout, so $stdout may be $null; joining
    # through @() yields an empty string instead of a null-method-call error.
    $inside = (@($stdout) -join '').Trim()

    if (-not $succeeded -or $inside -ne 'true') {
        throw 'Not in a Git work tree.'
    }
}
| 35 | + |
function Get-GitFilesUnderPwd {
    <#
    Emits the full path of every tracked file and every untracked-but-not-ignored
    file beneath the current directory, one string per file.

    Git is asked to list the current directory only, so the paths it prints are
    already scoped to (and relative to) that directory. No string-prefix
    filtering against the repository root is needed, which avoids matching
    sibling directories that merely share a name prefix.

    Entries that do not exist as regular files on disk (for example tracked
    files deleted from the work tree) are dropped.

    Throws if the current directory is not in a Git work tree or if
    'git ls-files' exits with a non-zero code.
    #>
    Assert-InGitWorkTree

    # ProviderPath is the underlying file-system path even when the current
    # location is expressed through a custom PSDrive.
    $pwdPath = (Get-Location).ProviderPath

    # Git prints paths as raw UTF-8 when -z is used. Ask PowerShell to decode
    # native output as UTF-8 for the duration of the call so non-ASCII file
    # names survive; restore the previous setting afterwards.
    $savedEncoding = $null
    try {
        $savedEncoding = [Console]::OutputEncoding
        [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false)
    }
    catch {
        # Best effort: some hosts have no usable console. Keep the host's
        # current decoding and do not attempt to restore anything.
        $savedEncoding = $null
    }

    try {
        # cached (tracked) + others (untracked, not ignored)
        $raw = & git -C $pwdPath ls-files -z --cached --others --exclude-standard
        $exitCode = $LASTEXITCODE
    }
    finally {
        if ($null -ne $savedEncoding) {
            try { [Console]::OutputEncoding = $savedEncoding } catch { }
        }
    }

    if ($exitCode -ne 0) {
        throw "git ls-files failed with exit code $exitCode."
    }

    # PowerShell splits native output on newlines and yields $null for empty
    # output. Re-join so a path containing a newline stays one entry and an
    # empty listing becomes an empty string.
    $joined = @($raw) -join "`n"
    $relativePaths = $joined.Split(
        [char[]]@([char]0), [System.StringSplitOptions]::RemoveEmptyEntries)

    # Unmerged index entries are listed once per stage; report each path once.
    $seen = [System.Collections.Generic.HashSet[string]]::new(
        [System.StringComparer]::Ordinal)

    foreach ($relPath in $relativePaths) {
        if (-not $seen.Add($relPath)) { continue }

        $fullPath = Join-Path $pwdPath $relPath
        if (Test-Path -LiteralPath $fullPath -PathType Leaf) { $fullPath }
    }
}
| 61 | + |
| 62 | +# --- Probes ------------------------------------------------------------------ |
function Test-ProbablyBinary {
    <#
    Heuristic binary check: returns $true when a NUL byte occurs within the
    first 8 KiB of the file, otherwise $false.

    Parameters:
      Path - path of the file to probe.

    Empty files return $false. Any error while opening or reading the file is
    deliberately swallowed and also reported as $false (best effort).
    #>
    param([Parameter(Mandatory)][string]$Path)

    $probeSize = 8192

    try {
        $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite')
        try {
            $buffer = [byte[]]::new($probeSize)

            # Stream.Read may return fewer bytes than requested, so keep reading
            # until the probe window is full or the stream ends.
            $filled = 0
            while ($filled -lt $probeSize) {
                $read = $stream.Read($buffer, $filled, $probeSize - $filled)
                if ($read -le 0) { break }
                $filled += $read
            }

            # Scan only the bytes actually read. The rest of the buffer is
            # zero-initialised and would otherwise look like NUL content.
            return ([Array]::IndexOf($buffer, [byte]0, 0, $filled) -ge 0)
        }
        finally { $stream.Dispose() }
    }
    catch { return $false }
}
| 81 | + |
function Get-TextEncodingCategory {
    <#
    Classifies a file by inspecting its bytes.

    Parameters:
      Path - path of the file to classify.

    Returns:
      'utf8'          - empty file, or every byte sequence is valid UTF-8
      'utf8-with-bom' - file starts with the UTF-8 BOM (EF BB BF); the rest of
                        the content is not validated
      'other'         - no NUL in the first 8 KiB but not valid UTF-8
      $null           - a NUL byte occurs in the first 8 KiB (likely binary)

    Errors while opening or reading the file propagate to the caller; only an
    actual UTF-8 decoding failure is reported as 'other'.
    #>
    param([Parameter(Mandatory)][string]$Path)

    $chunkSize = 8192

    # The file is opened once and validated in chunks, so it is never loaded
    # into memory as a whole and cannot change identity between checks.
    $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite')
    try {
        $buffer = [byte[]]::new($chunkSize)

        # Fill the first chunk completely (Stream.Read may return short counts).
        $filled = 0
        while ($filled -lt $chunkSize) {
            $read = $stream.Read($buffer, $filled, $chunkSize - $filled)
            if ($read -le 0) { break }
            $filled += $read
        }

        if ($filled -eq 0) { return 'utf8' }

        # BOM check (EF BB BF)
        if ($filled -ge 3 -and
            $buffer[0] -eq 0xEF -and $buffer[1] -eq 0xBB -and $buffer[2] -eq 0xBF) {
            return 'utf8-with-bom'
        }

        # Binary probe over the bytes actually read from the first chunk.
        if ([Array]::IndexOf($buffer, [byte]0, 0, $filled) -ge 0) { return $null }

        # Strict decoder: no BOM emission, throw on invalid byte sequences. A
        # Decoder keeps partial sequences between calls, so multi-byte
        # characters that straddle a chunk boundary are handled correctly.
        $strictUtf8 = [System.Text.UTF8Encoding]::new($false,$true)
        $decoder = $strictUtf8.GetDecoder()
        $chars = [char[]]::new($strictUtf8.GetMaxCharCount($chunkSize))

        try {
            $count = $filled
            while ($count -gt 0) {
                [void]$decoder.GetChars($buffer, 0, $count, $chars, 0, $false)
                $count = $stream.Read($buffer, 0, $chunkSize)
            }

            # Final flush: a truncated sequence at end of file is invalid.
            [void]$decoder.GetChars($buffer, 0, 0, $chars, 0, $true)
            return 'utf8'
        }
        catch {
            # Only a decoding failure means "not UTF-8". Look through any
            # wrapper exception, and rethrow everything else (I/O errors etc.).
            $ex = $_.Exception
            while ($null -ne $ex -and $ex -isnot [System.Text.DecoderFallbackException]) {
                $ex = $ex.InnerException
            }
            if ($null -ne $ex) { return 'other' }
            throw
        }
    }
    finally { $stream.Dispose() }
}
| 117 | + |
| 118 | +# --- Main -------------------------------------------------------------------- |
# Relative paths of files classified as 'other'. A List is used so that
# appending stays cheap on large repositories.
$otherFiles = [System.Collections.Generic.List[string]]::new()

# Lower-cased extension -> per-category counters.
$byExtension = @{}

foreach ($fullPath in @(Get-GitFilesUnderPwd)) {
    # Avoid decoding likely-binary files.
    if (Test-ProbablyBinary $fullPath) { continue }

    # One unreadable file (locked, permission denied, removed mid-run) should
    # not abort the whole survey: report it and move on.
    try {
        $category = Get-TextEncodingCategory $fullPath
    }
    catch {
        Write-Warning "Skipping '$fullPath': $($_.Exception.Message)"
        continue
    }

    # $null means the classifier considered the file likely binary.
    if (-not $category) { continue }

    # Invariant lower-casing keeps extension grouping independent of the
    # current culture.
    $ext = [IO.Path]::GetExtension($fullPath).ToLowerInvariant()
    if (-not $byExtension.ContainsKey($ext)) {
        $byExtension[$ext] = @{ 'utf8' = 0; 'utf8-with-bom' = 0; 'other' = 0 }
    }

    $byExtension[$ext][$category]++

    if ($category -eq 'other') {
        $otherFiles.Add([string](Resolve-Path -LiteralPath $fullPath -Relative))
    }
}

# 1) Files in 'other'
if ($otherFiles.Count -gt 0) {
    'Files classified as ''other'':'
    $otherFiles | Sort-Object | ForEach-Object { " $_" }
    ''
}

# 2) Table by extension
$rows = foreach ($kv in $byExtension.GetEnumerator()) {
    # Files without an extension are grouped under a visible placeholder.
    $ext = if ($kv.Key) { $kv.Key } else { '[noext]' }
    $u = [int]$kv.Value['utf8']
    $b = [int]$kv.Value['utf8-with-bom']
    $o = [int]$kv.Value['other']

    [PSCustomObject]@{
        Extension       = $ext
        UTF8            = $u
        'UTF8-with-BOM' = $b
        Other           = $o
        Total           = $u + $b + $o
    }
}

# Largest groups first; ties are ordered alphabetically by extension.
$rows |
    Sort-Object -Property (
        @{Expression='Total';Descending=$true},
        @{Expression='Extension';Descending=$false}
    ) |
    Format-Table -AutoSize
0 commit comments