11Imports System.IO
22Imports System.IO.Compression
3+ Imports System.Runtime.InteropServices
34
45Imports K4os.Compression.LZ4
56
67Imports K4os.Compression.LZ4.Streams
78
9+ Imports Microsoft.Win32.SafeHandles
10+
811
912'https://unix.stackexchange.com/questions/155901/estimate-compressibility-of-file
1013
@@ -29,6 +32,7 @@ Public Class Estimator
2932 Public CompressionRatio As Single
3033 End Class
3134
35+ Public Shared IsAlternate As Boolean = False
3236
3337 Public Function EstimateCompressability(analysisResult As List( Of AnalysedFileDetails), ishdd As Boolean , Optional MaxParallelism As Integer = 1 , Optional clusterSize As Integer = 4096 , Optional cancellationToken As Threading.CancellationToken = Nothing ) As List( Of (AnalysedFile As AnalysedFileDetails, CompressionRatio As Single ))
3438
@@ -38,26 +42,37 @@ Public Class Estimator
3842
3943 Me .IsHDD = ishdd
4044
45+
46+ 'Filter the files based on the cluster size and sort them by cluster location if it's an HDD to minimize seek time
47+ Dim filteredList = analysisResult.Where( Function (f) f.UncompressedSize > clusterSize)
48+
49+ Dim sw = Stopwatch.StartNew()
50+
51+ If ishdd Then filteredList = filteredList.OrderBy( Function (f) GetFirstLcn(f.FileName))
52+ Dim finalList = filteredList.ToList
53+ sw.Stop()
54+ Debug.WriteLine( $"Filtered and sorted {finalList.Count} files in {sw.ElapsedMilliseconds} ms" )
55+ If IsAlternate Then Debug.WriteLine( "Alternate mode enabled, using different handle method." )
56+
57+
4158 Dim paraOptions As New ParallelOptions With {.MaxDegreeOfParallelism = MaxParallelism}
4259
43- Parallel.ForEach(analysisResult, parallelOptions:=paraOptions, Sub (fl)
60+ Parallel.ForEach(finalList, parallelOptions:=paraOptions, Sub (fl)
61+ cancellationToken.ThrowIfCancellationRequested()
4462
45- If fl.UncompressedSize > clusterSize Then
46- If cancellationToken <> Nothing AndAlso cancellationToken.IsCancellationRequested Then
47- Throw New OperationCanceledException(cancellationToken)
48- End If
49- Dim estimatedRatio = EstimateCompressabilityLZ4(fl.FileName, fl.UncompressedSize, cancellationToken)
63+ Dim estimatedRatio = EstimateCompressabilityLZ4(fl.FileName, fl.UncompressedSize, cancellationToken)
5064
51- _filesList.Add(( New FileDetails With {.AnalysedFile = fl, .CompressionRatio = estimatedRatio}))
52- End If
53- End Sub )
65+ _filesList.Add(( New FileDetails With {.AnalysedFile = fl, .CompressionRatio = estimatedRatio}))
66+ End Sub )
5467
5568 Dim retList As New List( Of (AnalysedFile As AnalysedFileDetails, CompressionRatio As Single ))
5669
5770 For Each item In _filesList
5871 retList.Add((item.AnalysedFile, item.CompressionRatio))
5972 Next
6073
74+ IsAlternate = Not IsAlternate ' Toggle the alternate mode for next run
75+
6176 Return retList
6277
6378 End Function
@@ -114,9 +129,7 @@ Public Class Estimator
114129 Dim bytesRead As Integer = input.Read(buffer, 0 , BlockSize)
115130
116131 While bytesRead > 0
117- If cancellationToken <> Nothing AndAlso cancellationToken.IsCancellationRequested Then
118- Throw New OperationCanceledException(cancellationToken)
119- End If
132+ cancellationToken.ThrowIfCancellationRequested()
120133 compressionStream.Write(buffer, 0 , bytesRead)
121134 totalWritten += bytesRead
122135 bytesRead = input.Read(buffer, 0 , BlockSize)
@@ -127,9 +140,7 @@ Public Class Estimator
127140 Dim stepSize As Long = BlockSize * (totalBlocks \ samplesNeeded)
128141 Dim buffer(BlockSize - 1 ) As Byte
129142 For i As Integer = 0 To samplesNeeded - 1
130- If cancellationToken <> Nothing AndAlso cancellationToken.IsCancellationRequested Then
131- Throw New OperationCanceledException(cancellationToken)
132- End If
143+ cancellationToken.ThrowIfCancellationRequested()
133144 input.Position = stepSize * i
134145 Dim bytesRead As Integer = input.Read(buffer, 0 , BlockSize)
135146 compressionStream.Write(buffer, 0 , bytesRead)
@@ -141,33 +152,6 @@ Public Class Estimator
141152 Return Math.Min(compressed.Length / Math.Max(totalWritten, 1 ), 1 . 0 )
142153 End Function
143154
144- 'Private Function EstimateCompressabilityHDD(input As FileStream, fileSize As Long, compressionFactory As CompressionStreamFactory, Optional cancellationToken As Threading.CancellationToken = Nothing) As Double
145- ' Dim MiddleChunkSize As Integer = SampleSize * BlockSize ' 10KB
146-
147- ' Dim totalWritten As Long = 0
148- ' Dim compressed = New MemoryStream()
149-
150- ' Using compressionStream As Stream = compressionFactory(compressed)
151- ' ' If file is smaller than 10KB, just use the whole file
152- ' Dim chunkSize As Integer = CInt(Math.Min(MiddleChunkSize, fileSize))
153- ' Dim middleStart As Long = Math.Max(0, (fileSize \ 2) - (chunkSize \ 2))
154-
155- ' Dim buffer(chunkSize - 1) As Byte
156- ' input.Position = middleStart
157- ' Dim bytesRead As Integer = input.Read(buffer, 0, chunkSize)
158-
159- ' If cancellationToken <> Nothing AndAlso cancellationToken.IsCancellationRequested Then
160- ' Throw New OperationCanceledException(cancellationToken)
161- ' End If
162-
163- ' If bytesRead > 0 Then
164- ' compressionStream.Write(buffer, 0, bytesRead)
165- ' totalWritten += bytesRead
166- ' End If
167- ' End Using
168-
169- ' Return Math.Min(compressed.Length / Math.Max(totalWritten, 1), 1.0)
170- 'End Function
171155
172156 Private Function EstimateCompressabilityHDD(input As FileStream, fileSize As Long , compressionFactory As CompressionStreamFactory, Optional cancellationToken As Threading.CancellationToken = Nothing ) As Double
173157 Dim NumClusters As Integer = SampleSize ' or any small number you want to sample
@@ -198,6 +182,102 @@ Public Class Estimator
198182 Return Math.Min(compressed.Length / Math.Max(totalWritten, 1 ), 1 . 0 )
199183 End Function
200184
185+ Public Function GetFirstLcn(filePath As String ) As Long
186+
187+ Dim handle As SafeFileHandle = File.OpenHandle(filePath)
188+
189+ If handle.IsInvalid Then Throw New IOException( "Failed to open file handle." )
190+
191+
192+ Dim inBuffer As NtfsInterop.STARTING_VCN_INPUT_BUFFER
193+ inBuffer.StartingVcn = 0
194+ Dim inBufferSize = Marshal.SizeOf(inBuffer)
195+ Dim inBufferPtr = Marshal.AllocHGlobal(inBufferSize)
196+ Marshal.StructureToPtr(inBuffer, inBufferPtr, False )
197+
198+ Dim outBufferSize = 4096
199+ Dim outBufferPtr = Marshal.AllocHGlobal(outBufferSize)
200+ Dim bytesReturned As Integer = 0
201+
202+ Try
203+ Dim result = NtfsInterop.DeviceIoControl(
204+ handle,
205+ NtfsInterop.FSCTL_GET_RETRIEVAL_POINTERS,
206+ inBufferPtr,
207+ inBufferSize,
208+ outBufferPtr,
209+ outBufferSize,
210+ bytesReturned,
211+ IntPtr.Zero
212+ )
213+
214+ 'Probably errors because the file is empty
215+ If Not result Then Return Long .MaxValue
216+
217+
218+ ' Marshal the output buffer to get the first LCN
219+ Dim extentOffset As Integer = Marshal.OffsetOf( Of RETRIEVAL_POINTERS_BUFFER)( "Extents" ).ToInt32()
220+ Dim lcn As Long = Marshal.ReadInt64(outBufferPtr, extentOffset + 8 )
221+ Return lcn
222+
223+ Finally
224+ Marshal.FreeHGlobal(inBufferPtr)
225+ Marshal.FreeHGlobal(outBufferPtr)
226+ handle.Close()
227+ End Try
228+ End Function
229+
230+ End Class
231+
232+ Friend Module NtfsInterop
233+ Public Const FSCTL_GET_RETRIEVAL_POINTERS As UInteger = &H90073
234+ Public Const OPEN_EXISTING As Integer = 3
235+ Public Const FILE_FLAG_BACKUP_SEMANTICS As Integer = &H2000000
236+ Public Const FILE_SHARE_READ As Integer = 1
237+ Public Const FILE_SHARE_WRITE As Integer = 2
238+ Public Const GENERIC_READ As Integer = &H80000000
239+
240+ <DllImport( "kernel32.dll" , SetLastError:= True , CharSet:=CharSet.Unicode)>
241+ Public Function CreateFile(
242+ lpFileName As String ,
243+ dwDesiredAccess As Integer ,
244+ dwShareMode As Integer ,
245+ lpSecurityAttributes As IntPtr,
246+ dwCreationDisposition As Integer ,
247+ dwFlagsAndAttributes As Integer ,
248+ hTemplateFile As IntPtr
249+ ) As SafeFileHandle
250+ End Function
251+
252+ <DllImport( "kernel32.dll" , SetLastError:= True )>
253+ Public Function DeviceIoControl(
254+ hDevice As SafeFileHandle,
255+ dwIoControlCode As UInteger,
256+ lpInBuffer As IntPtr,
257+ nInBufferSize As Integer ,
258+ lpOutBuffer As IntPtr,
259+ nOutBufferSize As Integer ,
260+ ByRef lpBytesReturned As Integer ,
261+ lpOverlapped As IntPtr
262+ ) As Boolean
263+ End Function
201264
202265
203- End Class
266+ <StructLayout(LayoutKind.Sequential)>
267+ Public Structure STARTING_VCN_INPUT_BUFFER
268+ Public StartingVcn As Long
269+ End Structure
270+
271+ <StructLayout(LayoutKind.Sequential)>
272+ Public Structure RETRIEVAL_POINTERS_BUFFER
273+ Public ExtentCount As Integer
274+ Public StartingVcn As Long
275+ Public Extents As LCN_EXTENT ' This is actually an array, but we only need the first
276+ End Structure
277+
278+ <StructLayout(LayoutKind.Sequential)>
279+ Public Structure LCN_EXTENT
280+ Public NextVcn As Long
281+ Public Lcn As Long
282+ End Structure
283+ End Module
0 commit comments