Skip to content

Commit 6a9465b

Browse files
committed
Update hasher func
1 parent fe48ee9 commit 6a9465b

File tree

4 files changed

+59
-133
lines changed

4 files changed

+59
-133
lines changed

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
{
22
"cSpell.words": [
3+
"chans",
34
"Dendra",
5+
"dendrascience",
46
"djafs",
57
"djfl",
68
"djfm",
@@ -13,6 +15,7 @@
1315
"repacker",
1416
"Subdirs",
1517
"subfolders",
18+
"taigrr",
1619
"toplevel",
1720
"uniconverter"
1821
]

cmd/uniconverter/main.go

Lines changed: 5 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package main
33
import (
44
"encoding/gob"
55
"flag"
6-
"log"
76
"os"
87
"path/filepath"
98
"strings"
@@ -50,8 +49,10 @@ func main() {
5049
// The filesystem is created at the output path.
5150
os.MkdirAll(*outputPath, 0o777)
5251
saveState, err := os.Open(filepath.Join(*outputPath, "boundaries.gob"))
52+
boundaries := []util.ZipBoundary{}
5353
if err != nil {
54-
boundaries, err := util.DetermineZipBoundaries(*directoryPath, *thresholdSize)
54+
55+
boundaries, err = util.DetermineZipBoundaries(*directoryPath, *thresholdSize)
5556
if err != nil {
5657
panic(err)
5758
}
@@ -62,7 +63,6 @@ func main() {
6263
gob.NewEncoder(f).Encode(boundaries)
6364
f.Close()
6465
} else {
65-
boundaries := []util.ZipBoundary{}
6666
gob.NewDecoder(saveState).Decode(&boundaries)
6767
saveState.Close()
6868
}
@@ -71,62 +71,21 @@ func main() {
7171
if err != nil {
7272
panic(err)
7373
}
74-
subpath := strings.TrimPrefix(sf, *directoryPath)
75-
newPath := filepath.Join(*outputPath, util.MappingDir, subpath)
76-
err = os.MkdirAll(newPath, 0o777)
77-
if err != nil {
78-
panic(err)
79-
}
80-
err = util.WriteJSONFile(filepath.Join(newPath, "subdirs.djfl"), lt)
81-
if err != nil {
82-
panic(err)
83-
}
84-
}
85-
86-
for _, sf := range subfiles {
87-
lt, err := util.CreateInitialDJAFSManifest(sf, *outputPath, true)
88-
if err != nil {
89-
panic(err)
90-
}
91-
subpath := strings.TrimPrefix(sf, *directoryPath)
74+
subpath := strings.TrimPrefix(boundary.Path, *directoryPath)
9275
newPath := filepath.Join(*outputPath, util.MappingDir, subpath)
9376
err = os.MkdirAll(newPath, 0o777)
9477
if err != nil {
9578
panic(err)
9679
}
97-
err = util.WriteJSONFile(filepath.Join(newPath, "subfiles.djfl"), lt)
80+
err = util.WriteJSONFile(filepath.Join(newPath, "lookups.djfl"), lt)
9881
if err != nil {
9982
panic(err)
10083
}
10184
}
102-
log.Println("created initial manifest files")
10385

10486
err = util.GCWorkDirs(filepath.Join(*outputPath, util.WorkDir))
10587
if err != nil {
10688
panic(err)
10789
}
108-
// for _, dir := range subfolders {
109-
// err := util.CreateDJAFSArchive(dir, false)
110-
// if err != nil {
111-
// panic(err)
112-
// }
113-
// }
114-
// for _, dir := range subfiles {
115-
// err := util.CreateDJAFSArchive(dir, true)
116-
// if err != nil {
117-
// panic(err)
118-
// }
119-
// }
120-
121-
// for each file under the subfiles path,
122-
// hash the file and create an entry in the metadata file.
123-
// then, zip all the files in the subfiles path into a .djfz (zip) file.
124-
125-
// for each folder under the subfolders path,
126-
// hash all the files in the folder and create an entry in the metadata file.
127-
// for empty folders, create an entry in the metadata file pointing to that folder
128-
// then, zip all the files in the subfolders path into a .djfz (zip) file.
12990

130-
// for all the .djfz files in the subfolders and subfiles path,
131-
// record the metrics into the djfl file. for api entries later
13291
}

util/compress.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func NewDJFZ(path string) (DJFZ, error) {
3535
// if err != nil {
3636
// return LookupTable{}, err
3737
// }
38-
// f, err := zrc.Open("lookup.djfl")
38+
// f, err := zrc.Open("lookups.djfl")
3939
// if err != nil {
4040
// return LookupTable{}, err
4141
// }
@@ -55,7 +55,7 @@ func LookupFromDJFZ(path string) (LookupTable, error) {
5555
if err != nil {
5656
return LookupTable{}, err
5757
}
58-
f, err := zrc.Open("lookup.djfl")
58+
f, err := zrc.Open("lookups.djfl")
5959
if err != nil {
6060
return LookupTable{}, err
6161
}
@@ -132,10 +132,8 @@ func CompressHashed(path string, dest string) error {
132132
}
133133

134134
func ZipInside(path string, filesOnly bool) error {
135-
filename := "subdirs.djfz"
136-
if filesOnly {
137-
filename = "files.djfz"
138-
}
135+
filename := "files.djfz"
136+
139137
info, err := os.Stat(path)
140138
if err != nil {
141139
return err

util/hasher.go

Lines changed: 47 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"runtime"
1313
"sort"
1414
"strings"
15+
"sync"
1516
"time"
1617

1718
"github.com/taigrr/colorhash"
@@ -84,7 +85,7 @@ func CreateFileLookupEntry(path, workDirPath string, initial bool) (LookupEntry,
8485
hash, err := GetFileHash(path)
8586

8687
_, err = CopyToWorkDir(path, workDirPath, hash)
87-
l.Target = HashPathFromHashInitial(hash, workDirPath) + filepath.Ext(path)
88+
l.Target = filepath.Join(hash, filepath.Ext(path))
8889
l.Name = path
8990
l.Modified = info.ModTime()
9091
l.FileSize = info.Size()
@@ -109,7 +110,9 @@ type lookupWorkerData struct {
109110
initial bool
110111
}
111112

112-
func initialLookupWorker(lwd chan lookupWorkerData, c chan LookupEntry, errChan chan error, doneChan chan struct{}) {
113+
func initialLookupWorker(lwd <-chan lookupWorkerData, c chan<- LookupEntry, errChan chan<- error, wg *sync.WaitGroup) {
114+
defer wg.Done()
115+
113116
for x := range lwd {
114117
le, err := CreateFileLookupEntry(x.subpath, x.output, x.initial)
115118
if err != nil {
@@ -118,7 +121,6 @@ func initialLookupWorker(lwd chan lookupWorkerData, c chan LookupEntry, errChan
118121
}
119122
c <- le
120123
}
121-
doneChan <- struct{}{}
122124
}
123125

124126
func CreateInitialDJAFSManifest(path, output string, filesOnly bool) (LookupTable, error) {
@@ -127,16 +129,22 @@ func CreateInitialDJAFSManifest(path, output string, filesOnly bool) (LookupTabl
127129
} else {
128130
output = filepath.Join(output, WorkDir)
129131
}
132+
130133
lt := LookupTable{sorted: false, Entries: EntrySet{}}
131-
lookupEntryChan := make(chan LookupEntry, 1)
132-
errChan := make(chan error, 1)
133-
lwdChan := make(chan lookupWorkerData, 1)
134-
doneChan := make(chan struct{}, 1)
135-
threads := runtime.NumCPU()
136-
for i := 0; i < threads; i++ {
137-
go initialLookupWorker(lwdChan, lookupEntryChan, errChan, doneChan)
134+
lookupEntryChan := make(chan LookupEntry, runtime.NumCPU())
135+
errChan := make(chan error, runtime.NumCPU())
136+
lwdChan := make(chan lookupWorkerData, runtime.NumCPU())
137+
var wg sync.WaitGroup
138+
139+
// Start workers
140+
wg.Add(runtime.NumCPU())
141+
for i := 0; i < runtime.NumCPU(); i++ {
142+
go initialLookupWorker(lwdChan, lookupEntryChan, errChan, &wg)
138143
}
144+
145+
// Start walker
139146
go func() {
147+
defer close(lwdChan)
140148
err := filepath.WalkDir(path, func(subpath string, info os.DirEntry, err error) error {
141149
if err != nil {
142150
return err
@@ -156,34 +164,44 @@ func CreateInitialDJAFSManifest(path, output string, filesOnly bool) (LookupTabl
156164
if err != nil {
157165
errChan <- err
158166
}
159-
close(lwdChan)
160167
}()
161-
workLoop:
162-
for {
168+
169+
// Process results
170+
go func() {
171+
wg.Wait()
172+
close(lookupEntryChan)
173+
close(errChan)
174+
}()
175+
176+
var chansClosed bool
177+
for !chansClosed {
163178
select {
164-
case <-doneChan:
165-
threads--
166-
if threads == 0 {
167-
break workLoop
179+
case le, ok := <-lookupEntryChan:
180+
if !ok {
181+
chansClosed = true
182+
continue
168183
}
169-
case le := <-lookupEntryChan:
170184
lt.Entries = append(lt.Entries, le)
171-
case errCErr := <-errChan:
185+
case err, ok := <-errChan:
186+
if !ok {
187+
chansClosed = true
188+
continue
189+
}
172190
switch {
173-
case errCErr == nil:
174-
case os.IsNotExist(errCErr):
175-
case errors.Is(errCErr, ErrExpectedFile):
176-
case errors.Is(errCErr, ErrUnexpectedSymlink):
191+
case err == nil:
192+
continue
193+
case errors.Is(err, os.ErrNotExist):
194+
continue
195+
case errors.Is(err, ErrExpectedFile):
196+
continue
197+
case errors.Is(err, ErrUnexpectedSymlink):
198+
continue
177199
default:
178-
log.Printf("error walking path %s: %s", path, errCErr)
179-
return LookupTable{}, errCErr
200+
log.Printf("error walking path %s: %s", path, err)
201+
return LookupTable{}, err
180202
}
181203
}
182204
}
183-
184-
close(doneChan)
185-
close(lookupEntryChan)
186-
close(errChan)
187205
sort.Sort(lt.Entries)
188206
return lt, nil
189207
}
@@ -311,58 +329,6 @@ func HashPathFromHash(hash string) string {
311329
return fmt.Sprintf("%d-%05d-%s", first, second, third)
312330
}
313331

314-
func HashPathFromHashInitial(hash, workDir string) string {
315-
hInt := colorhash.HashString(hash)
316-
hInt = hInt % GlobalModulus
317-
first := hInt
318-
second := 0
319-
third := hash
320-
321-
// first, format directory prefix
322-
dir := filepath.Join(workDir, fmt.Sprintf("%05d", first))
323-
// check to see how many iterables are in that directory
324-
des, err := os.ReadDir(dir)
325-
// if that directory doesn't exist at all, just return the hash
326-
// as there's no need to iterate on a non-existent directory
327-
// TODO check for other errors
328-
if os.IsNotExist(err) || err != nil {
329-
return fmt.Sprintf("%05d-%05d-%s", first, second, third)
330-
}
331-
332-
// if there are no iterables in that directory, just return the hash
333-
if len(des) == 0 {
334-
return fmt.Sprintf("%05d-%05d-%s", first, second, third)
335-
}
336-
337-
// for each of the iterable directories inside of the parent
338-
for _, de := range des {
339-
// first make sure it's a directory before any other checks
340-
if de.IsDir() {
341-
// get the path to the iterable directory
342-
iDir := filepath.Join(dir, de.Name())
343-
// get the contents of the iterable directory
344-
iDEs, err := os.ReadDir(iDir)
345-
// if there's an error, just return the hash
346-
if err != nil {
347-
return fmt.Sprintf("%05d-%05d-%s", first, second, third)
348-
}
349-
// if there are less than GlobalModulus files in the iterable directory
350-
if len(iDEs) <= GlobalModulus {
351-
return fmt.Sprintf("%05d-%05d-%s", first, second, third)
352-
}
353-
// special case: if we've already seen this file, just return the hash
354-
maybeFile := filepath.Join(iDir, fmt.Sprintf("%05d-%05d-%s", first, second, third))
355-
_, err = os.Stat(maybeFile)
356-
if err != nil {
357-
return fmt.Sprintf("%05d-%05d-%s", first, second, third)
358-
}
359-
// otherwise, increment the second counter and try again
360-
second++
361-
}
362-
}
363-
return fmt.Sprintf("%05d-%05d-%s", first, second, third)
364-
}
365-
366332
func WorkspacePrefixFromHashPath(path string) (string, error) {
367333
parts := strings.Split(path, "-")
368334
if len(parts) < 3 {

0 commit comments

Comments
 (0)