Speed up the parsing of JSON files when using unformatted stream access by reading data in chunks rather than one character at a time. See #363

jacobwilliams · jacobwilliams · commit fa245f263770 · 2018-09-22T23:59:27.000-05:00
I don't think pop_char needs to be recursive.
diff --git a/src/json_parameters.F90 b/src/json_parameters.F90
@@ -124,6 +124,8 @@ module json_parameters
 
     integer(IK),parameter :: seq_chunk_size = 256_IK !! chunk size for reading sequential files
 
+    integer(IK),parameter :: stream_chunk_size = 1000_IK!! chunk size for reading stream files
+
     integer(IK),parameter :: pushed_char_size = 10_IK !! size for `pushed_char`
                                                       !! array in [[json_core(type)]]
 
diff --git a/src/json_string_utilities.F90 b/src/json_string_utilities.F90
@@ -125,14 +125,17 @@ subroutine string_to_integer(str,ival,status_ok)
 
     ! Compute how many digits we need to read
     ndigits = 2*len_trim(str)
-    ndigits_digits = floor(log10(real(ndigits)))+1
-    allocate(character(kind=CDK,len=ndigits_digits) :: digits)
-    write(digits,'(I0)') ndigits !gfortran will have a runtime error with * edit descriptor here
-    ! gfortran bug: '*' edit descriptor for ISO_10646 strings does bad stuff.
-    read(str,'(I'//trim(digits)//')',iostat=ierr) ival   !string to integer
-
-    ! error check:
-    status_ok = (ierr==0)
+    if (ndigits/=0) then
+        ndigits_digits = floor(log10(real(ndigits)))+1
+        allocate(character(kind=CDK,len=ndigits_digits) :: digits)
+        write(digits,'(I0)') ndigits !gfortran will have a runtime error with * edit descriptor here
+        ! gfortran bug: '*' edit descriptor for ISO_10646 strings does bad stuff.
+        read(str,'(I'//trim(digits)//')',iostat=ierr) ival   !string to integer
+        ! error check:
+        status_ok = (ierr==0)
+    else
+        status_ok = .false.
+    end if
     if (.not. status_ok) ival = 0_IK
 
     end subroutine string_to_integer
diff --git a/src/json_value_module.F90 b/src/json_value_module.F90
@@ -256,6 +256,11 @@ module json_value_module
                                                   !! (both escaped and unescaped versions are still
                                                   !! valid in all cases).
 
+        integer :: ichunk = 0 !! index in `chunk` for [[pop_char]]
+                              !! when `use_unformatted_stream=True`
+        character(kind=CK,len=stream_chunk_size) :: chunk = CK_'' !! a chunk read from a stream file
+                                                                  !! when `use_unformatted_stream=True`
+
         contains
 
         private
@@ -917,6 +922,8 @@ subroutine json_initialize(me,verbose,compact_reals,&
     me%char_count   = 0
     me%line_count   = 1
     me%ipos         = 1
+    me%ichunk       = 0
+    me%chunk        = ''
 
 #ifdef USE_UCS4
     ! reopen stdout and stderr with utf-8 encoding
@@ -10147,7 +10154,7 @@ end subroutine parse_number
 !@note This routine ignores non-printing ASCII characters
 !      (`iachar<=31`) that are in strings.
 
-    recursive subroutine pop_char(json,unit,str,skip_ws,skip_comments,eof,popped)
+    subroutine pop_char(json,unit,str,skip_ws,skip_comments,eof,popped)
 
     implicit none
 
@@ -10170,6 +10177,10 @@ recursive subroutine pop_char(json,unit,str,skip_ws,skip_comments,eof,popped)
     logical(LK)              :: parsing_comment !! if we are in the process
                                                 !! of parsing a comment line
 
+    logical,parameter :: chunk_it = .true. !! if true, stream files are read in chunks,
+                                           !! rather than one character at a time.
+                                           !! this speeds up the parsing dramatically.
+
     if (.not. json%exception_thrown) then
 
         eof = .false.
@@ -10201,16 +10212,37 @@ recursive subroutine pop_char(json,unit,str,skip_ws,skip_comments,eof,popped)
 
                     !read the next character:
                     if (use_unformatted_stream) then
-                        read(unit=unit,pos=json%ipos,iostat=ios) c
+
+                        if (chunk_it) then
+                            ! in this case, we read the file in chunks.
+                            ! if we already have the character we need,
+                            ! then get it from the chunk. Otherwise,
+                            ! read another chunk
+
+                            if (json%ichunk<1) then
+                                json%ichunk = 0
+                                read(unit=unit,pos=json%ipos,iostat=ios) json%chunk
+                            else
+                                ios = 0
+                            end if
+                            json%ichunk = json%ichunk + 1
+                            c = json%chunk(json%ichunk:json%ichunk)
+                            if (json%ichunk==len(json%chunk)) then
+                                json%ichunk = 0 ! reset
+                            else
+                                ! we have to finish getting
+                                ! characters from this chunk:
+                                if (IS_IOSTAT_END(ios)) ios = 0
+                            end if
+                        else
+                            read(unit=unit,pos=json%ipos,iostat=ios) c
+                        end if
+
                     else
                         read(unit=unit,fmt='(A1)',advance='NO',iostat=ios) c
                     end if
                     json%ipos = json%ipos + 1
 
-                    !....note: maybe try read the file in chunks...
-                    !.... or use asynchronous read with double buffering
-                    !     (see Modern Fortran: Style and Usage)
-
                 else    !read from the string
 
                     str_len = len(str)   !length of the string
@@ -10302,6 +10334,7 @@ subroutine push_char(json,c)
             !in this case, c is ignored, and we just
             !decrement the stream position counter:
             json%ipos = json%ipos - 1
+            json%ichunk = json%ichunk - 1
 
         else