1- """Define the schema for the filesystem representation."""
1+ """Define the schema for the filesystem representation.
2+
3+ Memory optimization:
4+ - Lazy loading: File content is only loaded when the content property is accessed
5+ - Content caching: Content is cached to avoid repeated file reads
6+ - Cache clearing: The clear_content_cache method allows freeing memory when content is no longer needed
7+ - Chunked reading: Large files are read in 1 MB chunks; the chunks are then joined and cached, so the full content is still held in memory
8+ """
29
310from __future__ import annotations
411
@@ -49,6 +56,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
4956 dir_count : int = 0
5057 depth : int = 0
5158 children : list [FileSystemNode ] = field (default_factory = list )
59+ _content_cache : str | None = field (default = None , repr = False )
5260
5361 def sort_children (self ) -> None :
5462 """Sort the children nodes of a directory according to a specific order.
@@ -83,6 +91,18 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]:
8391
8492 self .children .sort (key = _sort_key )
8593
def clear_content_cache(self) -> None:
    """Clear the cached content to free up memory.

    Resets the content cache of this node and of every node reachable through
    ``children``, allowing the garbage collector to reclaim memory used by file contents.

    The tree is walked with an explicit stack rather than by recursion, so a very
    deeply nested directory tree cannot exhaust the interpreter's recursion limit.
    """
    pending = [self]
    while pending:
        node = pending.pop()
        node._content_cache = None
        # Queue the descendants instead of recursing into them.
        pending.extend(node.children)
105+
86106 @property
87107 def content_string (self ) -> str :
88108 """Return the content of the node as a string, including path and content.
@@ -104,12 +124,15 @@ def content_string(self) -> str:
104124 return "\n " .join (parts ) + "\n \n "
105125
106126 @property
107- def content (self ) -> str : # pylint: disable=too-many-return-statements
127+ def content (self ) -> str : # pylint: disable=too-many-return-statements,too-many-branches # noqa: C901,PLR0912
108128 """Return file content (if text / notebook) or an explanatory placeholder.
109129
110130 Heuristically decides whether the file is text or binary by decoding a small chunk of the file
111131 with multiple encodings and checking for common binary markers.
112132
133+ Uses lazy loading to avoid loading the entire file into memory until needed,
134+ and caches the result to avoid repeated file reads.
135+
113136 Returns
114137 -------
115138 str
@@ -121,29 +144,40 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
121144 If the node is a directory.
122145
123146 """
147+ # Return cached content if available
148+ if self ._content_cache is not None :
149+ return self ._content_cache
150+
124151 if self .type == FileSystemNodeType .DIRECTORY :
125152 msg = "Cannot read content of a directory node"
126153 raise ValueError (msg )
127154
128155 if self .type == FileSystemNodeType .SYMLINK :
129- return "" # TODO: are we including the empty content of symlinks?
156+ self ._content_cache = "" # TODO: are we including the empty content of symlinks?
157+ return self ._content_cache
130158
131159 if self .path .suffix == ".ipynb" : # Notebook
132160 try :
133- return process_notebook (self .path )
161+ self . _content_cache = process_notebook (self .path )
134162 except Exception as exc :
135- return f"Error processing notebook: { exc } "
163+ self ._content_cache = f"Error processing notebook: { exc } "
164+ else :
165+ return self ._content_cache
166+ return self ._content_cache
136167
137168 chunk = _read_chunk (self .path )
138169
139170 if chunk is None :
140- return "Error reading file"
171+ self ._content_cache = "Error reading file"
172+ return self ._content_cache
141173
142174 if chunk == b"" :
143- return "[Empty file]"
175+ self ._content_cache = "[Empty file]"
176+ return self ._content_cache
144177
145178 if not _decodes (chunk , "utf-8" ):
146- return "[Binary file]"
179+ self ._content_cache = "[Binary file]"
180+ return self ._content_cache
147181
148182 # Find the first encoding that decodes the sample
149183 good_enc : str | None = next (
@@ -152,10 +186,24 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
152186 )
153187
154188 if good_enc is None :
155- return "Error: Unable to decode file with available encodings"
189+ self ._content_cache = "Error: Unable to decode file with available encodings"
190+ return self ._content_cache
156191
157192 try :
193+ # Read the file in 1 MB chunks; the chunks are joined below, so the full content still ends up in memory
194+ content_chunks = []
195+ chunk_size = 1024 * 1024 # 1MB chunks
196+
158197 with self .path .open (encoding = good_enc ) as fp :
159- return fp .read ()
198+ while True :
199+ chunk = fp .read (chunk_size )
200+ if not chunk :
201+ break
202+ content_chunks .append (chunk )
203+
204+ self ._content_cache = "" .join (content_chunks )
160205 except (OSError , UnicodeDecodeError ) as exc :
161- return f"Error reading file with { good_enc !r} : { exc } "
206+ self ._content_cache = f"Error reading file with { good_enc !r} : { exc } "
207+ else :
208+ return self ._content_cache
209+ return self ._content_cache
0 commit comments