@@ -5,17 +5,17 @@ Classify text files by encoding under the current subtree, respecting .gitignore
.DESCRIPTION
Enumerates tracked files and untracked-but-not-ignored files (via Git) beneath
PWD. Skips likely-binary files (NUL probe). Classifies remaining files as:
  - 'utf8'          : valid UTF-8 (no BOM) or empty file
  - 'utf8-with-bom' : starts with UTF-8 BOM (EF BB BF)
  - 'other'         : text but not valid UTF-8 (e.g., UTF-16/ANSI)

Outputs:
  1) Relative paths of files classified as 'other'
  2) A table by extension: UTF8 / UTF8-with-BOM / Other / Total

Notes:
  - Read-only: this script makes no changes.
  - Requires Git and must be run inside a Git work tree.
#>
2020
2121[CmdletBinding ()]
@@ -26,93 +26,93 @@ $ErrorActionPreference = 'Stop'
2626
2727# --- Git enumeration ---------------------------------------------------------
function Assert-InGitWorkTree {
    <#
    .SYNOPSIS
    Throws 'Not in a Git work tree.' unless the current location is inside a
    Git work tree.

    .NOTES
    Runs `git rev-parse --is-inside-work-tree` with stderr discarded and
    requires both a zero exit code and the literal output 'true'.
    #>

    # Capture stdout before touching it: outside a repository git prints
    # nothing to stdout, so the result is $null and calling .Trim() on it
    # directly would raise a null-method error instead of the message below.
    $output   = & git rev-parse --is-inside-work-tree 2>$null
    $exitCode = $LASTEXITCODE

    # "$output" turns $null into '' so the comparison below is always safe.
    $inside = "$output".Trim()

    if ($exitCode -ne 0 -or $inside -ne 'true') {
        throw 'Not in a Git work tree.'
    }
}
3535
function Get-GitFilesUnderPwd {
    <#
    .SYNOPSIS
    Returns full paths to tracked + untracked-not-ignored files under PWD.

    .DESCRIPTION
    Lists the whole repository with `git ls-files` (cached + others, honoring
    the standard exclude rules), then keeps only existing files that live
    beneath the current location. Each path is emitted at most once.

    .OUTPUTS
    System.String. One normalized absolute path per file.
    #>
    Assert-InGitWorkTree

    $repoRoot = (& git rev-parse --show-toplevel).Trim()

    # Build the subtree prefix WITH a trailing separator so that a location of
    # '<root>/foo' does not also match the sibling '<root>/foobar/...'.
    $sep  = [System.IO.Path]::DirectorySeparatorChar
    $alt  = [System.IO.Path]::AltDirectorySeparatorChar
    $here = [System.IO.Path]::GetFullPath((Get-Location).Path)
    $subtreePrefix = $here.TrimEnd([char[]]@($sep, $alt)) + $sep

    # cached (tracked) + others (untracked not ignored)
    $nulSeparated = & git -C $repoRoot ls-files -z --cached --others --exclude-standard

    # An empty listing yields $null; there is nothing to split or emit.
    if (-not $nulSeparated) { return }

    $relativePaths = $nulSeparated.Split(
        [char]0, [System.StringSplitOptions]::RemoveEmptyEntries)

    # ls-files --cached prints an unmerged path once per index stage; track
    # what was already emitted so such files are not counted several times.
    $seen = [System.Collections.Generic.HashSet[string]]::new()

    foreach ($relPath in $relativePaths) {
        # GetFullPath normalizes separators (git reports '/'), so the prefix
        # comparison is made on paths of the same shape.
        $fullPath = [System.IO.Path]::GetFullPath((Join-Path $repoRoot $relPath))

        # Only include files under the current subtree.
        if (-not $fullPath.StartsWith($subtreePrefix,
                [System.StringComparison]::OrdinalIgnoreCase)) { continue }

        if (-not $seen.Add($fullPath)) { continue }

        if (Test-Path -LiteralPath $fullPath -PathType Leaf) { $fullPath }
    }
}
6161
6262# --- Probes ------------------------------------------------------------------
function Test-ProbablyBinary {
    <#
    .SYNOPSIS
    Heuristic binary probe: $true if the first 8 KiB contains any NUL byte.

    .DESCRIPTION
    Files that begin with a UTF-16 or UTF-32 byte order mark are reported as
    NOT binary even though they normally contain NUL bytes, so that they can
    be classified as non-UTF-8 text rather than silently skipped.

    .PARAMETER Path
    Path of the file to probe.

    .OUTPUTS
    System.Boolean. $false for empty files and for files that cannot be
    opened or read (best effort: the probe never throws).
    #>
    param([Parameter(Mandatory)][string]$Path)

    try {
        # Share mode 'ReadWrite' lets the probe open files other processes
        # currently have open.
        $stream = [System.IO.File]::Open($Path, 'Open', 'Read', 'ReadWrite')
        try {
            $len = [int][Math]::Min(8192, $stream.Length)
            if ($len -le 0) { return $false }

            $buffer = [byte[]]::new($len)
            [void]$stream.Read($buffer, 0, $len)

            # UTF-16 LE/BE BOM (FF FE / FE FF); FF FE also covers UTF-32 LE.
            if ($len -ge 2) {
                $isUtf16Le = $buffer[0] -eq 0xFF -and $buffer[1] -eq 0xFE
                $isUtf16Be = $buffer[0] -eq 0xFE -and $buffer[1] -eq 0xFF
                if ($isUtf16Le -or $isUtf16Be) { return $false }
            }

            # UTF-32 BE BOM (00 00 FE FF).
            if ($len -ge 4 -and
                $buffer[0] -eq 0x00 -and $buffer[1] -eq 0x00 -and
                $buffer[2] -eq 0xFE -and $buffer[3] -eq 0xFF) {
                return $false
            }

            return ($buffer -contains 0)
        }
        finally { $stream.Dispose() }
    }
    catch { return $false }
}
8181
function Get-TextEncodingCategory {
    <#
    .SYNOPSIS
    Returns 'utf8', 'utf8-with-bom', 'other', or $null for likely-binary.

    .DESCRIPTION
    Decision order:
      1. Empty file                              -> 'utf8'
      2. Starts with UTF-8 BOM (EF BB BF)        -> 'utf8-with-bom'
      3. Starts with a UTF-16 or UTF-32 BOM      -> 'other'
      4. NUL byte within the first 8 KiB         -> $null (likely binary)
      5. Whole file decodes as strict UTF-8      -> 'utf8', otherwise 'other'

    .PARAMETER Path
    Path of the file to classify.

    .NOTES
    Errors from opening the file propagate to the caller.
    #>
    param([Parameter(Mandatory)][string]$Path)

    $stream = [System.IO.File]::Open($Path, 'Open', 'Read', 'ReadWrite')
    try {
        $fileLength = $stream.Length
        if ($fileLength -eq 0) { return 'utf8' }

        # One read serves both the BOM checks and the binary probe.
        $sampleLen = [int][Math]::Min(8192, $fileLength)
        $sample = [byte[]]::new($sampleLen)
        [void]$stream.Read($sample, 0, $sampleLen)

        # BOM check (EF BB BF)
        if ($sampleLen -ge 3 -and
            $sample[0] -eq 0xEF -and $sample[1] -eq 0xBB -and $sample[2] -eq 0xBF) {
            return 'utf8-with-bom'
        }

        # UTF-16/UTF-32 text is full of NUL bytes, so it must be recognized
        # by its BOM BEFORE the NUL probe or it would be dropped as binary.
        # FF FE = UTF-16 LE (and UTF-32 LE), FE FF = UTF-16 BE.
        if ($sampleLen -ge 2) {
            $isUtf16Le = $sample[0] -eq 0xFF -and $sample[1] -eq 0xFE
            $isUtf16Be = $sample[0] -eq 0xFE -and $sample[1] -eq 0xFF
            if ($isUtf16Le -or $isUtf16Be) { return 'other' }
        }

        # 00 00 FE FF = UTF-32 BE.
        if ($sampleLen -ge 4 -and
            $sample[0] -eq 0x00 -and $sample[1] -eq 0x00 -and
            $sample[2] -eq 0xFE -and $sample[3] -eq 0xFF) {
            return 'other'
        }

        # Quick binary probe before expensive decoding
        if ($sample -contains 0) { return $null }
    }
    finally { $stream.Dispose() }

    # Validate UTF-8 by decoding with throw-on-invalid option (no BOM).
    try {
        $bytes = [System.IO.File]::ReadAllBytes($Path)
        $utf8 = [System.Text.UTF8Encoding]::new($false, $true)
        [void]$utf8.GetString($bytes)
        return 'utf8'
    }
    catch { return 'other' }
}
117117
118118# --- Main --------------------------------------------------------------------
@@ -122,50 +122,50 @@ $byExtension = @{}
# Classify every candidate file and tally the results per extension.
# $otherFiles and $byExtension are initialized earlier in the script.
$allFiles = Get-GitFilesUnderPwd

foreach ($fullPath in $allFiles) {
    # Avoid decoding likely-binary files.
    if (Test-ProbablyBinary $fullPath) { continue }

    # $null means the classifier itself judged the file to be binary.
    $category = Get-TextEncodingCategory $fullPath
    if (-not $category) { continue }

    # Invariant lower-casing: culture-sensitive ToLower() would turn '.INI'
    # into '.ını' under Turkish cultures and distort the report.
    $ext = [IO.Path]::GetExtension($fullPath).ToLowerInvariant()
    if (-not $byExtension.ContainsKey($ext)) {
        $byExtension[$ext] = @{ 'utf8' = 0; 'utf8-with-bom' = 0; 'other' = 0 }
    }

    $byExtension[$ext][$category]++

    if ($category -eq 'other') {
        $otherFiles += (Resolve-Path -LiteralPath $fullPath -Relative)
    }
}

# 1) Files in 'other'
if ($otherFiles.Count -gt 0) {
    'Files classified as ''other'':'
    $otherFiles | Sort-Object | ForEach-Object { "  $_" }
    ''
}

# 2) Table by extension
$rows = foreach ($kv in $byExtension.GetEnumerator()) {
    # Files without an extension are keyed by the empty string.
    $ext = if ($kv.Key) { $kv.Key } else { '[noext]' }
    $u = [int]$kv.Value['utf8']
    $b = [int]$kv.Value['utf8-with-bom']
    $o = [int]$kv.Value['other']

    [PSCustomObject]@{
        Extension       = $ext
        UTF8            = $u
        'UTF8-with-BOM' = $b
        Other           = $o
        Total           = $u + $b + $o
    }
}

# Largest groups first; ties broken alphabetically by extension.
$rows |
    Sort-Object -Property (
        @{Expression = 'Total'; Descending = $true},
        @{Expression = 'Extension'; Descending = $false}
    ) |
    Format-Table -AutoSize
0 commit comments