|
31 | 31 | (update 0 (fn [s] (.substring ^String s (dec c1))))
|
32 | 32 | (update (dec (count lines)) (fn [s] (.substring ^String s 0 (- c2 c1))))))))
|
33 | 33 |
|
| 34 | +(defn parse-h1 [s] |
| 35 | + (when s |
| 36 | + (let [[t1 t2 & tokens :as h1] (string/split (string/trim s) #"\s+")] |
| 37 | + (cond |
| 38 | + ;; prompt is the first token |
| 39 | + (and t1 (= "prompt" (string/lower-case t1))) |
| 40 | + (merge |
| 41 | + {:role (or (#{"user" "system"} t2) "user")} |
| 42 | + (let [v (concat (if (#{"user" "system"} t2) [] (when t2 [t2])) tokens)] |
| 43 | + (when (seq v) |
| 44 | + {:title (string/join " " v)}))) |
| 45 | + |
| 46 | + ;; prompt is the second token |
| 47 | + (and t2 (= "prompt" (string/lower-case t2))) |
| 48 | + (merge |
| 49 | + {:role t1} |
| 50 | + (when (seq tokens) {:title (string/join " " tokens)})))))) |
| 51 | + |
34 | 52 | (def prompt-pattern-with-role-capture #"(?i)\s*prompt\s+(\w+)\s?")
|
35 | 53 | (def prompt-pattern #"(?i)\s*prompt\s?(\w+)?\s?")
|
36 | 54 |
|
37 | 55 | (defn extract-role [s]
|
38 |
| - (second |
39 |
| - (re-find prompt-pattern-with-role-capture s))) |
| 56 | + (:role (parse-h1 s))) |
40 | 57 |
|
41 | 58 | ;; headings that include the word Prompt
|
42 | 59 | (defn prompt-section? [content node]
|
43 |
| - (re-matches |
44 |
| - prompt-pattern |
45 |
| - (-> node (nth 2) (nth 3) (nth 1) (from-range content)))) |
| 60 | + (parse-h1 (-> node (nth 2) (nth 3) (nth 1) (from-range content)))) |
46 | 61 |
|
47 | 62 | (defn remove-first-line [s]
|
48 | 63 | (->> (string/split s #"\n")
|
|
51 | 66 |
|
52 | 67 | ;; extract Role from Prompt ....
|
53 | 68 | (defn node-content [content node]
|
54 |
| - {:role |
55 |
| - (or (-> node (nth 2) (nth 3) (nth 1) (from-range content) (extract-role)) "user") |
56 |
| - :content |
57 |
| - (remove-first-line (from-range (nth node 1) content))}) |
58 |
| - |
59 |
| -(comment |
60 |
| - (extract-role "prompt user") |
61 |
| - (extract-role "prompt") |
62 |
| - (extract-role "prompt user and more") |
63 |
| - (re-matches prompt-pattern "prompt") |
64 |
| - (re-matches prompt-pattern "prompt user") |
65 |
| - (re-matches prompt-pattern "prompt user")) |
| 69 | + (merge |
| 70 | + (-> node (nth 2) (nth 3) (nth 1) (from-range content) (parse-h1)) |
| 71 | + ;; TODO merge a description and filter node content |
| 72 | + {:content |
| 73 | + (remove-first-line (from-range (nth node 1) content))})) |
| 74 | + |
| 75 | +(defn section? [node] |
| 76 | + (and (list? node) (= "section" (first node)))) |
| 77 | + |
| 78 | +(defn description-section? [content node] |
| 79 | + (when-let [atx-header-node (first (filter #(= "atx_heading" (first %)) node))] |
| 80 | + (println "trimmed " (string/trim (from-range (-> atx-header-node (nth 3) (nth 1)) content))) |
| 81 | + (= "description" (string/trim (from-range (-> atx-header-node (nth 3) (nth 1)) content))))) |
| 82 | + |
| 83 | +(defn atx-heading-section? [node] |
| 84 | + (= "atx_heading" (first node))) |
| 85 | + |
| 86 | +(defn remove-section-content [content s node] |
| 87 | + (if (and (list? node) (= "atx_heading" (first node))) |
| 88 | + (string/replace s (from-range (nth node 1) content) "") |
| 89 | + s)) |
| 90 | + |
| 91 | +(defn section-content-without-headings [content node] |
| 92 | + (reduce |
| 93 | + (partial remove-section-content content) |
| 94 | + (from-range (nth node 1) content) |
| 95 | + (seq node))) |
| 96 | + |
| 97 | +(defn h1-prompt-content [content node] |
| 98 | + (merge |
| 99 | + (-> node (nth 2) (nth 3) (nth 1) (from-range content) (parse-h1)) |
| 100 | + ;; TODO merge a description and filter node content |
| 101 | + (if (some section? node) |
| 102 | + (merge |
| 103 | + {:content (->> node |
| 104 | + (filter section?) |
| 105 | + (filter (complement (partial description-section? content))) |
| 106 | + #_(filter (complement atx-heading-section?)) |
| 107 | + (map (partial section-content-without-headings content)) |
| 108 | + (apply str) |
| 109 | + (string/trim))} |
| 110 | + (when-let [description (->> node |
| 111 | + (filter section?) |
| 112 | + (filter (partial description-section? content)) |
| 113 | + first |
| 114 | + (section-content-without-headings content) |
| 115 | + (string/trim))] |
| 116 | + {:description description})) |
| 117 | + {:content (string/trim (section-content-without-headings content node))}))) |
| 118 | + |
| 119 | +(def heading-1-loc->top-level-section-node (comp zip/node zip/up zip/up)) |
| 120 | + |
| 121 | +(defn extract-prompts-with-descriptions [content ast] |
| 122 | + (->> |
| 123 | + (iterate zip/next (zip/seq-zip ast)) |
| 124 | + (take-while (complement zip/end?)) |
| 125 | + (filter heading-1-section?) |
| 126 | + (map heading-1-loc->top-level-section-node) |
| 127 | + (filter (partial prompt-section? content)) |
| 128 | + (map (partial h1-prompt-content content)))) |
66 | 129 |
|
67 | 130 | (defn extract-prompts [content ast]
|
68 | 131 | (->>
|
|
101 | 164 | (defn extract-first-comment [content ast]
|
102 | 165 | (try
|
103 | 166 | (when-let [loc (->>
|
104 |
| - (iterate zip/next (zip/seq-zip ast)) |
105 |
| - (take-while (complement zip/end?)) |
106 |
| - (some (fn [loc] (when (html-comment? loc) loc))))] |
| 167 | + (iterate zip/next (zip/seq-zip ast)) |
| 168 | + (take-while (complement zip/end?)) |
| 169 | + (some (fn [loc] (when (html-comment? loc) loc))))] |
107 | 170 | (->
|
108 |
| - (from-range (-> loc zip/node second) content) |
109 |
| - (remove-markers) |
110 |
| - (clj-yaml/parse-string))) |
| 171 | + (from-range (-> loc zip/node second) content) |
| 172 | + (remove-markers) |
| 173 | + (clj-yaml/parse-string))) |
111 | 174 | (catch Throwable ex
|
112 | 175 | (println ex)
|
113 | 176 | nil)))
|
114 | 177 |
|
115 |
| -(defn parse-new [content query] |
116 |
| - (let [content (str content "\n# END\n\n") |
117 |
| - x (docker/function-call-with-stdin |
118 |
| - {:image "vonwig/tree-sitter:latest" |
119 |
| - :content content |
120 |
| - :command (concat |
121 |
| - ["-lang" "markdown"] |
122 |
| - ["-query" query])}) |
123 |
| - {s :pty-output} (async/<!! (async/thread |
124 |
| - (Thread/sleep 10) |
125 |
| - (docker/finish-call x)))] |
126 |
| - (->> s))) |
127 |
| - |
128 |
| -(comment |
129 |
| - ; TODO - migrate to tree-sitter queries but can we express this with tree-sitter |
130 |
| - (parse-new (slurp "./tprompt1.md") "(document) @doc") |
131 |
| - (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (minus_metadata) @doc)")) |
132 |
| - (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (section (html_block) @html))")) |
133 |
| - (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (section (atx_heading (atx_h1_marker)))* @top-section)"))) |
134 |
| - |
135 | 178 | (defn parse-markdown
|
136 | 179 | "use the custom sexp representation"
|
137 | 180 | [content]
|
|
144 | 187 | (->> (edn/read-string s))))
|
145 | 188 |
|
146 | 189 | (defn parse-prompts
|
147 |
| - "parse out the h1 prompt sections" |
| 190 | + "parse out the h1 prompt sections and the metadata" |
148 | 191 | [content]
|
149 | 192 | (let [content (str content "\n# END\n\n")
|
150 | 193 | ast (parse-markdown content)]
|
151 | 194 | {:messages
|
152 | 195 | (->> ast
|
153 |
| - (extract-prompts content) |
| 196 | + (extract-prompts-with-descriptions content) |
154 | 197 | (into []))
|
155 |
| - :metadata (or |
| 198 | + :metadata (or |
156 | 199 | (extract-metadata content ast)
|
157 | 200 | (extract-first-comment content ast)
|
158 | 201 | {})}))
|
159 | 202 |
|
160 |
| -(comment |
161 |
| - (parse-prompts (slurp "./broken.md")) |
162 |
| - ) |
| 203 | +;; ---------- future --------- |
163 | 204 |
|
164 |
| -(comment |
165 |
| - ; inline same line !,[,],(,) in that order after filtering out other irrelevant things |
166 |
| - ; ^ those are imgages and the content between the [ ] should be put into a separate message |
167 |
| - ; the first minus_metadata block of the doc |
168 |
| - ; the first html_block section that ends with --> |
169 |
| - ; get content and then check of --- --- pre-amble |
170 |
| - ; then try to parse the yaml out of that |
171 |
| - (parse-markdown (slurp "./tprompt2.md")) |
172 |
| - (parse-prompts (slurp "./tprompt1.md")) |
173 |
| - (parse-prompts (slurp "./tprompt2.md")) |
174 |
| - ) |
| 205 | +(defn parse-new [content query] |
| 206 | + (let [content (str content "\n# END\n\n") |
| 207 | + x (docker/function-call-with-stdin |
| 208 | + {:image "vonwig/tree-sitter:latest" |
| 209 | + :content content |
| 210 | + :command (concat |
| 211 | + ["-lang" "markdown"] |
| 212 | + ["-query" query])}) |
| 213 | + {s :pty-output} (async/<!! (async/thread |
| 214 | + (Thread/sleep 10) |
| 215 | + (docker/finish-call x)))] |
| 216 | + (->> s))) |
175 | 217 |
|
176 | 218 | (comment
|
177 |
| - (string/split content #"\n") |
178 |
| - |
179 |
| - (def content (slurp "prompts/qrencode/README.md")) |
180 |
| - (pprint (parse-markdown content)) |
181 |
| - |
182 |
| - (parse-markdown (slurp "prompts/pylint/docs.md")) |
183 |
| - (parse-markdown (slurp "prompts/pylint/4-run-violation-insert.md")) |
184 |
| - |
185 |
| - (def t |
186 |
| - '("section" |
187 |
| - "4:1-8:1" |
188 |
| - ("atx_heading" |
189 |
| - "4:1-5:1" |
190 |
| - ("atx_h1_marker" "4:1-4:2") |
191 |
| - ("inline" "4:3-4:13")) |
192 |
| - ("paragraph" |
193 |
| - "6:1-7:1" |
194 |
| - ("inline" "6:1-6:34" ("," "6:26-6:27") ("?" "6:33-6:34"))))) |
195 |
| - |
196 |
| - (def r "4:3-4:13") |
197 |
| - (prompt-section? "" t) |
198 |
| - (from-range "4:3-4:13" content) |
199 |
| - (from-range "8:1-14:1" content)) |
| 219 | + ; TODO - migrate to tree-sitter queries but can we express this with tree-sitter |
| 220 | + (parse-new (slurp "./tprompt1.md") "(document) @doc") |
| 221 | + (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (minus_metadata) @doc)")) |
| 222 | + (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (section (html_block) @html))")) |
| 223 | + (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (section (atx_heading (atx_h1_marker)))* @top-section)"))) |
| 224 | + |
0 commit comments