@@ -260,6 +260,9 @@ def detect_filetype(
260260 if _is_text_file_a_json (file = file , filename = filename ):
261261 return FileType .JSON
262262
263+ if _is_text_file_a_csv (file = file , filename = filename ):
264+ return FileType .CSV
265+
263266 if file and not extension and _check_eml_from_buffer (file = file ) is True :
264267 return FileType .EML
265268
@@ -327,14 +330,12 @@ def _detect_filetype_from_octet_stream(file: IO) -> FileType:
327330 return FileType .UNK
328331
329332
330- def _is_text_file_a_json (
333+ def _read_file_start_for_type_check (
331334 filename : Optional [str ] = None ,
332- content_type : Optional [str ] = None ,
333335 file : Optional [IO ] = None ,
334- ):
335- """Detects if a file that has a text/plain MIME type is a JSON file ."""
336+ ) -> str :
337+ """Reads the start of the file and returns the text content ."""
336338 exactly_one (filename = filename , file = file )
337-
338339 if file is not None :
339340 file .seek (0 )
340341 file_content = file .read (4096 )
@@ -343,13 +344,37 @@ def _is_text_file_a_json(
343344 else :
344345 file_text = file_content .decode (errors = "ignore" )
345346 file .seek (0 )
346- elif filename is not None :
347+ if filename is not None :
347348 with open (filename ) as f :
348- file_text = f .read ()
349+ file_text = f .read (4096 )
350+ return file_text
351+
349352
def _is_text_file_a_json(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> bool:
    """Report whether a file with a text/plain MIME type is really a JSON file.

    The start of the file is obtained through ``_read_file_start_for_type_check``
    and tested against ``LIST_OF_DICTS_PATTERN``; the file counts as JSON when
    the pattern matches at the beginning of that text.
    """
    head_text = _read_file_start_for_type_check(filename=filename, file=file)
    pattern_match = re.match(LIST_OF_DICTS_PATTERN, head_text)
    return pattern_match is not None
351360
352361
def _is_text_file_a_csv(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> bool:
    """Detects if a file that has a text/plain MIME type is a CSV file.

    The check is a heuristic over the first few lines returned by
    ``_read_file_start_for_type_check``. The text is treated as CSV when:

    * it has at least two lines,
    * every sampled line contains a comma, and
    * every sampled line that is known to be complete parses to the same
      number of fields as the header, with at least two fields.

    Fields are counted with the standard ``csv`` parser so that quoted values
    containing commas (e.g. ``"Smith, John",42``) are counted as one field.

    Args:
        filename: Path of the file to inspect. Passed through to
            ``_read_file_start_for_type_check``.
        file: Open file object to inspect. Passed through to
            ``_read_file_start_for_type_check``.

    Returns:
        True if the start of the file looks like comma-separated data,
        otherwise False.
    """
    # Local import: the module's import block is not visible from this function.
    import csv

    sample_size = 10

    file_text = _read_file_start_for_type_check(file=file, filename=filename)
    all_lines = file_text.strip().splitlines()
    if len(all_lines) < 2:
        return False

    sample = all_lines[:sample_size]
    if any("," not in line for line in sample):
        return False

    # Only the final line of the text that was read can be cut off mid-row.
    # When more lines follow the sample, every sampled line is complete and
    # can be checked; otherwise the last one is exempt from the column count.
    if len(all_lines) > len(sample):
        complete_lines = sample
    else:
        complete_lines = sample[:-1]

    try:
        rows = list(csv.reader(complete_lines))
    except csv.Error:
        return False
    if not rows:
        return False

    column_count = len(rows[0])
    if column_count < 2:
        return False
    return all(len(row) == column_count for row in rows)
376+
377+
353378def _check_eml_from_buffer (file : IO ) -> bool :
354379 """Checks if a text/plain file is actually a .eml file. Uses a regex pattern to see if the
355380 start of the file matches the typical pattern for a .eml file."""
@@ -359,7 +384,6 @@ def _check_eml_from_buffer(file: IO) -> bool:
359384 file_head = file_content .decode ("utf-8" , errors = "ignore" )
360385 else :
361386 file_head = file_content
362-
363387 return EMAIL_HEAD_RE .match (file_head ) is not None
364388
365389
0 commit comments