Create stdlib/fs.jou and DirIter to loop through directory contents (#1273)

Akuli · web-flow · commit 17a434b96f8d · 2026-03-11T18:23:51.000+02:00
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
@@ -171,10 +171,11 @@ jobs:
         shell: bash
       - run: cp -r repo/tests repo/runtests.sh "test dir"
         shell: bash
-      - name: "Delete tests that depend on the compiler and cannot work without it"
+      - name: "Delete tests that depend on files outside the tests folder"
         run: |
           rm -v "test dir"/tests/should_succeed/compiler_unit_tests.jou
           rm -v "test dir"/tests/should_succeed/keywords.jou
+          rm -v "test dir"/tests/should_succeed/fs_test.jou
         shell: bash
       - run: cd "test dir" && ./jou.exe --verbose examples/hello.jou
         shell: bash
diff --git a/doc/fs.md b/doc/fs.md
@@ -0,0 +1,106 @@
+# File system utilities
+
+This file documents `stdlib/fs.jou`.
+
+
+## Iterating the contents of a directory
+
+TL;DR:
+
+```python
+iter = DirIter{dir = "path/to/some/directory"}
+while iter.next():
+    printf("%s\n", iter.path)  # path/to/some/directory/file.txt
+    printf("%s\n", iter.name)  # file.txt
+
+if iter.error_code != 0:
+    printf("Error: %s\n", iter.error_message)
+```
+
+The `DirIter` class can be used to loop through the files and folders in a directory
+(also known as a folder).
+
+When creating a `DirIter`, you should set all unused fields to zero
+by e.g. using [the `ClassName{}` syntax](classes.md#instantiating-syntax) as shown above.
+You can set the following fields:
+- `dir: byte*` is a path to the directory being listed. This is the only field that you must set.
+- `include_dot_and_dotdot: bool` can be set to `True`
+    if you want to get the special `.` and `..` entries when iterating the directory.
+    They are skipped by default.
+
+You should call `iter.next()` repeatedly until it returns `False`.
+Return value `True` means that a file or subdirectory was found,
+and `iter.path` and `iter.name` were updated accordingly.
+Return value `False` means that either an error occurred or the end of the directory was reached.
+If `.next()` has already returned `False`, calling `.next()` again returns `False` without doing anything.
+
+The memory used for iterating is freed when `.next()` returns `False`.
+This means that you don't need any cleanup,
+but to avoid leaking memory and the underlying directory handle,
+you shouldn't stop calling `.next()` until you get the `False`.
+Please [create an issue on GitHub](https://github.com/Akuli/jou/issues/new)
+if you need to stop iterating early.
+
+After calling `.next()`, you can use the following fields:
+- `path: byte*` is the path to the file or subdirectory inside the given `dir`.
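+    It consists of `dir`, a separator if `dir` does not already end with one, and a file or subdirectory name.
+    The separator is `/`, except on Windows where `\` is added unless `dir` already ends with `/` or `\`.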
+    The string in `iter.path` is only valid until the following call to `.next()`,
+    so if you want to use the string after the following call to `.next()`,
+    you need to make a copy of the string.
+    This field is `NULL` if `iter.next()` returned `False`.
+- `name: byte*` is the file or subdirectory name without the rest of the path.
+    Similarly to `iter.path`, this is only valid until the following call to `.next()`
+    and you may need to make a copy.
+    This field is `NULL` if `iter.next()` returned `False`.
+- `error_code: int` is nonzero if `iter.next()` returned `False` due to an error,
+    and zero if no error has occurred.
+    This is [a Windows API error number](https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-) on Windows
+    and an [errno value](../stdlib/errno.jou) on other systems.
+- `error_message: byte[512]` is a human-readable error message,
+    or an empty string if no error has occurred.
+
+The iteration order is whatever the operating system and file system happen to produce,
+and you shouldn't rely on it.
+If you need a predictable order, you can collect the names and [sort the strings](sorting.md#sorting-strings):
+
+```python
+import "stdlib/fs.jou"
+import "stdlib/io.jou"
+import "stdlib/list.jou"
+import "stdlib/mem.jou"
+import "stdlib/sort.jou"
+import "stdlib/str.jou"
+
+def main() -> int:
+    results = List[byte*]{}
+
+    iter = DirIter{dir = "doc/images"}
+    while iter.next():
+        results.append(strdup(iter.name))
+
+    if iter.error_code != 0:
+        printf("Error: %s\n", iter.error_message)
+        return 1
+
+    sort_strings(results.ptr, results.len)
+
+    # Output: 64bit-meme-small.jpg
+    # Output: 64bit-meme.jpg
+    # Output: sources.txt
+    for i = 0; i < results.len; i++:
+        puts(results.ptr[i])
+        free(results.ptr[i])  # Free the copy created with strdup()
+
+    free(results.ptr)
+    return 0
+```
+
+
+## Windows support
+
+On Windows, paths containing non-ASCII characters and very long paths may not work properly.
+The reason is that `stdlib/fs.jou` uses the ANSI versions of Windows API functions,
+such as `FindFirstFileA` and `FindNextFileA`.
+Please [create an issue on GitHub](https://github.com/Akuli/jou/issues/new)
+if you need to work with arbitrary Windows paths.
+A proper fix for this is planned, but not implemented.
diff --git a/doctest.sh b/doctest.sh
@@ -40,7 +40,7 @@ function generate_expected_output()
 {
     local joufile="$1"
 
-    (grep -onH '# Warning: .*' "$joufile" || true) | sed -E s/'(.*):([0-9]*):# Warning: '/'compiler warning for file "test.jou", line \2: '/
+    (grep -onH '# Warning: .*' "$joufile" || true) | sed -E s/'(.*):([0-9]*):# Warning: '/'compiler warning for file "\1", line \2: '/
     (grep -onH '# Error: .*' "$joufile" || true) | sed -E s/'(.*):([0-9]*):# Error: '/'compiler error in file "\1", line \2: '/
     (grep -oE '# Output:.*' "$joufile" || true) | sed -E s/'^# Output: ?'//
 }
@@ -67,14 +67,26 @@ done
 ntotal=0
 nfail=0
 
-cd tmp/doctest
-for file in */*.jou; do
+for file in tmp/doctest/*/*.jou; do
     # Print file and line number, as in "doc/foo.md:123: "
     # Newline is deleted to avoid warning on NetBSD 9.3, see issue #500
-    echo -n "$(basename "$(dirname "$file")" | tr -d '\n' | base64 -d):$(basename "$file" | cut -d'.' -f1 | sed 's/^0*//'): "
+    md_file="$(basename "$(dirname "$file")" | tr -d '\n' | base64 -d)"
+    md_lineno=$(basename "$file" | cut -d'.' -f1 | sed 's/^0*//')
+    echo -n "$md_file:$md_lineno: "
 
-    cp "$file" test.jou
-    if $diff --text -u <(generate_expected_output test.jou | tr -d '\r') <( ("$jou" test.jou 2>&1 || true) | tr -d '\r'); then
+    cp "$file" tmp/doctest/test.jou
+
+    # Choose the directory the doctest is run from.
+    # Compare the file name exactly: an unanchored regex such as "fs.md" would
+    # also match e.g. "doc/defs.md", because "." matches any character.
+    if [[ "$(basename "$md_file")" == "fs.md" ]]; then
+        # These doctests refer to files by path, so they run from the
+        # directory where this script was started
+        working_dir="."
+        relative_path="tmp/doctest/test.jou"
+    else
+        # Some doctests contain assertion failures that mention "test.jou"
+        working_dir="tmp/doctest"
+        relative_path="test.jou"
+    fi
+
+    if $diff --text -u <(cd "$working_dir" && generate_expected_output "$relative_path" | tr -d '\r') <( (cd "$working_dir" && "$jou" "$relative_path" 2>&1 || true) | tr -d '\r'); then
         echo "ok"
     else
         ((nfail++)) || true
diff --git a/stdlib/fs.jou b/stdlib/fs.jou
@@ -0,0 +1,232 @@
+import "stdlib/list.jou"
+import "stdlib/mem.jou"
+import "stdlib/str.jou"
+
+# Platform-specific declarations used by the DirIter class in this file.
+# Windows uses the FindFirstFileA/FindNextFileA family, other systems use
+# opendir/readdir/closedir.
+if WINDOWS:
+    import "stdlib/assert.jou"
+
+    # TODO: This should really use W functions, not A functions.
+    #       But then we would need a way to convert between byte* and uint16*.
+    #       I have an idea for that (WTF-8) but I haven't implemented it yet.
+    #       The rest of the standard library will need changes too.
+    #
+    # Output buffer filled in by FindFirstFileA() and FindNextFileA().
+    # DirIter only reads cFileName from it.
+    class WIN32_FIND_DATAA:
+        dwFileAttributes: uint32
+        ftCreationTime: uint32[2]
+        ftLastAccessTime: uint32[2]
+        ftLastWriteTime: uint32[2]
+        nFileSizeHigh: uint32
+        nFileSizeLow: uint32
+        dwReserved0: uint32
+        dwReserved1: uint32
+        cFileName: byte[260]  # TODO: this can be quite limiting
+        cAlternateFileName: byte[14]
+
+    # Find handles are passed around as int64 values.
+    declare FindFirstFileA(FileName: byte*, FindFileData: WIN32_FIND_DATAA*) -> int64
+    declare FindNextFileA(hFindFile: int64, FindFileData: WIN32_FIND_DATAA*) -> int
+    declare FindClose(hFindFile: int64) -> int
+    # DirIter treats this return value of FindFirstFileA() as "nothing found".
+    const INVALID_HANDLE_VALUE: int64 = -1
+
+    declare GetLastError() -> uint32
+    # Reported by DirIter when the given directory path is an empty string.
+    const ERROR_PATH_NOT_FOUND: uint32 = 3
+    # DirIter treats this GetLastError() value as a normal end of directory, not as a failure.
+    const ERROR_NO_MORE_FILES: uint32 = 18
+
+    # Used by DirIter to fill error_message from a GetLastError() value.
+    declare FormatMessageA(
+        dwFlags: uint32,
+        lpSource: void*,
+        dwMessageId: uint32,
+        dwLanguageId: uint32,
+        lpBuffer: byte*,
+        nSize: uint32,
+        Arguments: void*,  # actually va_list*
+    ) -> uint32
+    const FORMAT_MESSAGE_IGNORE_INSERTS: uint32 = 0x00000200
+    const FORMAT_MESSAGE_FROM_SYSTEM: uint32 = 0x00001000
+
+else:
+    import "stdlib/errno.jou"
+    import "stdlib/intnative.jou"
+
+    # strerror_r() is used by DirIter to fill error_message from an errno value.
+    # Its return value is not checked by DirIter.
+    if LINUX:
+        # There are two versions of strerror_r(), and the one actually named
+        # strerror_r() is the wrong one.
+        declare __xpg_strerror_r(errnum: int, buf: byte*, buflen: intnative) -> int
+        def strerror_r(errnum: int, buf: byte*, buflen: intnative) -> int:
+            return __xpg_strerror_r(errnum, buf, buflen)
+    else:
+        declare strerror_r(errnum: int, buf: byte*, buflen: intnative) -> int
+
+    # Directory stream returned by opendir(). It has no fields here and is
+    # only ever used through DIR* pointers.
+    class DIR:
+        pass
+
+    # One directory entry as returned by readdir(). DirIter only reads d_name.
+    # Presumably the other fields are listed so that d_name ends up at the same
+    # offset as in the C library's struct on each system -- verify against
+    # <dirent.h> before changing anything here.
+    if LINUX:
+        class dirent:
+            d_ino: uint64
+            d_off: int64
+            d_reclen: uint16
+            d_type: byte
+            d_name: byte[256]
+    elif MACOS:
+        # This struct definition was a bit painful to find. One way to find it
+        # is to run C preprocessor on '#include <dirent.h>' in GitHub Actions.
+        assert not IS_32BIT
+        class dirent:
+            d_ino: uint64
+            d_seekoff: uint64
+            d_reclen: uint16
+            d_namlen: uint16
+            d_type: byte
+            d_name: byte[1024]
+    elif NETBSD:
+        class dirent:
+            d_fileno: uint64
+            d_reclen: uint16
+            d_namlen: uint16
+            d_type: byte
+            d_name: byte[512]
+    else:
+        assert False  # unsupported system
+
+    # After this block, opendir() and readdir() can be called the same way on
+    # every non-Windows system.
+    if NETBSD:
+        # On NetBSD, "opendir" and "readdir" are legacy functions.
+        # We can't use them because they generate a linker warning.
+        # The dirent.h header magically renames them at compile time to the following names.
+        declare __opendir30(name: byte*) -> DIR*
+        declare __readdir30(dirp: DIR*) -> dirent*
+        def opendir(name: byte*) -> DIR*:
+            return __opendir30(name)
+        def readdir(dirp: DIR*) -> dirent*:
+            return __readdir30(dirp)
+    else:
+        declare opendir(name: byte*) -> DIR*
+        declare readdir(dirp: DIR*) -> dirent*
+
+    declare closedir(dirp: DIR*) -> int
+
+
+# Iterating directory contents
+#
+# Typical use: create DirIter{dir = "some/dir"} with all other fields zeroed,
+# then call .next() until it returns False. After each True, `path` and `name`
+# describe one entry. After the final False, `error_code` is nonzero if the
+# iteration stopped because of an error.
+@public
+class DirIter:
+    # Inputs given by user
+    dir: byte*
+    include_dot_and_dotdot: bool
+
+    # Output
+    # Both strings point into the buffer of path_list, which is reused by each
+    # successful .next() call and freed when iteration ends.
+    path: byte*
+    name: byte*
+
+    error_code: int  # errno or GetLastError
+    error_message: byte[512]
+
+    # Internal state
+    # path_list holds the bytes of the current `path`, including the terminating '\0'.
+    path_list: List[byte]
+    if WINDOWS:
+        handle: int64
+    else:
+        dir_ptr: DIR*
+
+    # Stores the full path of one directory entry into self.path and self.name.
+    #
+    # Returns False without touching self.path or self.name if the entry is
+    # "." or ".." and include_dot_and_dotdot is not set. Otherwise returns True.
+    def set_name(self, name: byte*) -> bool:
+        if (not self.include_dot_and_dotdot) and (strcmp(name, ".") == 0 or strcmp(name, "..") == 0):
+            return False
+
+        # Reuse the same list for every entry: start over from an empty list and
+        # rebuild dir + separator + name.
+        self.path_list.len = 0
+        self.path_list.extend_from_ptr(self.dir, strlen(self.dir))
+
+        # Add a separator unless dir already ends with one. On Windows both
+        # kinds of slash are accepted at the end of dir, and a backslash is added.
+        if WINDOWS:
+            if not (ends_with(self.dir, "/") or ends_with(self.dir, "\\")):
+                self.path_list.append('\\')
+        else:
+            if not ends_with(self.dir, "/"):
+                self.path_list.append('/')
+
+        # Remember where the name starts so that self.name can point into the same buffer.
+        name_start_index = self.path_list.len
+        self.path_list.extend_from_ptr(name, strlen(name))
+        self.path_list.append('\0')
+
+        # Take the pointers only after all appends are done.
+        self.path = self.path_list.ptr
+        self.name = &self.path[name_start_index]
+        return True
+
+    # Advances to the next directory entry.
+    #
+    # Returns True after updating self.path and self.name. Returns False when
+    # there are no more entries or when an error occurred; error_code and
+    # error_message tell which one it was. When the iteration ends or fails
+    # after the directory was opened, the path buffer is freed, the directory
+    # handle is closed and *self is reset, so further calls return False.
+    @public
+    def next(self) -> bool:
+        # dir is NULL if the user never set it, or if an earlier call already
+        # finished and reset *self.
+        if self.dir == NULL:
+            return False
+
+        if WINDOWS:
+            if self.dir[0] == '\0':
+                # "\\*" would search the root of the current drive. Let's not do that.
+                # Note that *self is not reset here, so calling next() again
+                # reports the same error again.
+                self.error_code = ERROR_PATH_NOT_FOUND as int
+                FormatMessageA(
+                    FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                    NULL, self.error_code as uint32, 0, self.error_message, sizeof(self.error_message) as uint32, NULL,
+                )
+                return False
+
+            # Jou initializes everything to zero, but INVALID_HANDLE_VALUE is
+            # more appropriate for this. Windows never uses zero as a find
+            # handle value, so this is fine.
+            if self.handle == 0:
+                self.handle = INVALID_HANDLE_VALUE
+
+            find_data: WIN32_FIND_DATAA
+            if self.handle == INVALID_HANDLE_VALUE:
+                # First file
+                # The search pattern is the directory followed by "\*".
+                pattern: byte* = NULL
+                asprintf(&pattern, "%s\\*", self.dir)
+                assert pattern != NULL
+
+                self.handle = FindFirstFileA(pattern, &find_data)
+                # NOTE(review): if FindFirstFileA() failed, this free() runs
+                # before GetLastError() is called below. That relies on free()
+                # not changing the last-error value -- confirm.
+                free(pattern)
+                found = (self.handle != INVALID_HANDLE_VALUE)
+            else:
+                # Not first file
+                found = (FindNextFileA(self.handle, &find_data) != 0)
+
+            # Skip entries that set_name() rejects ("." and ".." by default).
+            while found:
+                if self.set_name(find_data.cFileName):
+                    return True
+                found = (FindNextFileA(self.handle, &find_data) != 0)
+
+            # Nothing more was found. Read the error code before the cleanup
+            # calls below, presumably so that they cannot overwrite it.
+            e = GetLastError() as int
+
+            free(self.path_list.ptr)
+            if self.handle != INVALID_HANDLE_VALUE:
+                FindClose(self.handle)
+            # Reset every field, including dir, path and name.
+            *self = DirIter{}
+
+            if e != ERROR_NO_MORE_FILES:
+                # It failed
+                self.error_code = e
+                FormatMessageA(
+                    FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                    NULL, self.error_code as uint32, 0, self.error_message, sizeof(self.error_message) as uint32, NULL,
+                )
+
+            return False
+
+        else:
+            if self.dir_ptr == NULL:
+                # This is the first time this is called
+                self.dir_ptr = opendir(self.dir)
+                if self.dir_ptr == NULL:
+                    # It failed
+                    # Everything except error_code is reset, so dir becomes NULL.
+                    *self = DirIter{error_code = get_errno()}
+                    strerror_r(self.error_code, self.error_message, sizeof(self.error_message))
+                    return False
+
+            # Loop until set_name() accepts an entry or readdir() returns NULL.
+            while True:
+                # readdir() returns NULL both at the end of the directory and on
+                # error. errno is cleared first so that a nonzero errno
+                # afterwards means an error.
+                set_errno(0)
+                entry = readdir(self.dir_ptr)
+                if entry == NULL:
+                    # End of directory, or error reading directory
+                    # Save errno before the cleanup calls below.
+                    e = get_errno()
+                    free(self.path_list.ptr)
+                    closedir(self.dir_ptr)
+                    # Reset every field, including dir, path and name.
+                    *self = DirIter{}
+                    if e != 0:
+                        # It failed
+                        self.error_code = e
+                        strerror_r(e, self.error_message, sizeof(self.error_message))
+                    return False
+
+                if self.set_name(entry.d_name):
+                    return True
diff --git a/tests/data/folder_2files_1dir/file 2 with spaces.txt b/tests/data/folder_2files_1dir/file 2 with spaces.txt
@@ -0,0 +1 @@
+hello there
diff --git a/tests/data/folder_2files_1dir/file1.txt b/tests/data/folder_2files_1dir/file1.txt
@@ -0,0 +1 @@
+hi
diff --git a/tests/data/folder_2files_1dir/subdir/file3.txt b/tests/data/folder_2files_1dir/subdir/file3.txt
@@ -0,0 +1 @@
+this is a file
diff --git a/tests/should_succeed/fs_test.jou b/tests/should_succeed/fs_test.jou