Merge pull request #1 from kula-ai/patch_paragraph_initialize

matthewgani · web-flow · commit c913e1a38af4 · 2024-08-19T15:05:19.000+08:00
Fix text run splitting
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ pkg/*
 doc/
 vendor/
 coverage/
+.idea
diff --git a/lib/docx/containers/paragraph.rb b/lib/docx/containers/paragraph.rb
@@ -8,19 +8,90 @@ class Paragraph
         include Container
         include Elements::Element
 
+        PLACEHOLDER_REGEX = /\{\{(.*?)\}\}/ # Matches a {{...}} placeholder and captures the text between the braces (non-greedy); used to combine text runs that split such a pattern
+
         def self.tag # returns the literal tag name string 'p'
           'p'
         end
 
 
         # Child elements: pPr, r, fldSimple, hlink, subDoc
         # http://msdn.microsoft.com/en-us/library/office/ee364458(v=office.11).aspx
+        #
+        # See: https://github.com/ruby-docx/docx/issues/147 for placeholder patching
         def initialize(node, document_properties = {}, doc = nil) # stores the node, properties and owning document, then repairs split placeholders
           @node = node
           @properties_tag = 'pPr' # 'pPr' is the first child element listed in the comment above
           @document_properties = document_properties
           @font_size = @document_properties[:font_size] # read from the :font_size key of the properties passed in
           @document = doc
+          validate_placeholder_content # runs on every construction and may rewrite the text of this paragraph's text runs
+        end
+
+        # Detects placeholders whose text is split across several text runs and repairs the affected runs, if any exist.
+        def validate_placeholder_content
+          placeholder_position_hash = detect_placeholder_positions
+          content_size = [0]
+          text_runs.each_with_index do |text_node, index|
+            content_size[index + 1] = text_node.text.length + (index.zero? ? 0 : content_size[index])
+          end
+          content_size.pop
+          placeholder_position_hash.each do |placeholder, placeholder_positions|
+            placeholder_positions.each do |p_start_index|
+              p_end_index = (p_start_index + placeholder.length - 1)
+              tn_start_index = content_size.index(content_size.select { |size| size <= p_start_index }.max)
+              tn_end_index = content_size.index(content_size.select { |size| size <= p_end_index }.max)
+              next if tn_start_index == tn_end_index
+              replace_incorrect_placeholder_content(placeholder, tn_start_index, tn_end_index, content_size[tn_start_index] - p_start_index, p_end_index - content_size[tn_end_index])
+            end
+          end
+        end
+
+        # This method detects each placeholder's starting indexes and returns them in an array per placeholder.
+        # Ex: Assumptions : text = 'This is Placeholder Text with {{Placeholder}} {{Text}} {{Placeholder}}'
+        #     It will detect the placeholder's starting index from the given text.
+        #     Here, starting index of '{{Placeholder}}' => [30, 55], '{{Text}}' => [46]
+        # @return [Hash]
+        # Ex: {'{{Placeholder}}' => [30, 55], '{{Text}}' => [46]}
+        def detect_placeholder_positions
+          text.scan(PLACEHOLDER_REGEX).flatten.uniq.each_with_object({}) do |placeholder, placeholder_hash|
+            next if placeholder.include?("{") || placeholder.include?("}")
+            placeholder_text = "{{#{placeholder}}}"
+            current_index = text.index(placeholder_text)
+            arr_of_index = [current_index]
+            until current_index.nil?
+              current_index = text.index(placeholder_text, current_index + 1)
+              arr_of_index << current_index unless current_index.nil?
+            end
+            placeholder_hash[placeholder_text] = arr_of_index
+          end
+        end
+
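+        # @param [String] placeholder the complete placeholder text, e.g. '{{Placeholder}}'
+        # @param [Integer] start_index index of the text run in which the placeholder begins
+        # @param [Integer] end_index index of the text run in which the placeholder ends
+        # @param [Integer] p_start_index position in the first run's text from which the placeholder is written
+        # @param [Integer] p_end_index position in the last run's text of the placeholder's final character
+        # This method replaces the following:
+        #   1. The content of the corrupted text nodes with an empty string
+        #   2. The split pieces with the proper placeholder content within a single text node
+        # Ex: Assume we have an array of text node contents as text_runs = ['This is ', 'Placeh', 'older Text', 'with ', '{{', 'Place', 'holder}}' , '{{Text}}', '{{Placeholder}}']
+        #   Here the first '{{Placeholder}}' is not contained in a single text node. We need to merge the content of indexes text_runs[4], text_runs[5], text_runs[6].
+        #   So we will replace the content as below:
+        #     1. text_runs[4] = '{{Placeholder}}'
+        #     2. text_runs[5] = ''
+        #     3. text_runs[6] = ''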
+        def replace_incorrect_placeholder_content(placeholder, start_index, end_index, p_start_index, p_end_index)
+          (start_index..end_index).each do |index|
+            if index == start_index
+              current_text = text_runs[index].text.to_s
+              current_text[p_start_index..-1] = placeholder
+              text_runs[index].text = current_text
+            elsif index == end_index
+              current_text = text_runs[index].text.to_s
+              current_text[0..p_end_index] = ""
+              text_runs[index].text = current_text
+            else
+              text_runs[index].text = ""
+            end
+          end
         end
 
         # Set text of paragraph

-Original file line number
+Diff line change
 doc/
 vendor/
 coverage/
 +.idea