|
1 | 1 | (ns markdown
|
2 | 2 | (:require
|
| 3 | + [cheshire.core :as json] |
| 4 | + [clj-yaml.core :as clj-yaml] |
3 | 5 | [clojure.core.async :as async]
|
4 | 6 | [clojure.edn :as edn]
|
5 | 7 | [clojure.pprint :refer [pprint]]
|
|
61 | 63 | (filter (partial prompt-section? content))
|
62 | 64 | (map (partial node-content content))))
|
63 | 65 |
|
64 |
| -(defn parse-markdown [content] |
(defn metadata-section?
  "True when the node at zipper location `loc` has \"minus_metadata\" as its
  first element."
  [loc]
  (let [node (zip/node loc)]
    (= "minus_metadata" (first node))))
| 68 | + |
(defn remove-markers
  "Returns the text between the first and the last `---` marker found in `s`.
  Returns nil when `s` is nil or does not contain two `---` markers."
  [s]
  ;; The leading `.*?` is reluctant so the match starts at the FIRST `---`,
  ;; while the greedy capture group extends to the LAST `---`. A `---` that
  ;; appears inside the payload (e.g. `title: a---b`) therefore no longer
  ;; truncates the captured text.
  (when s
    (second (re-find #"(?s).*?---(.*)---.*" s))))
| 71 | + |
(defn extract-metadata
  "Walks `ast` with a seq zipper and takes the first location for which
  `metadata-section?` holds. The second element of that node is handed to
  `from-range` together with `content`; the resulting text is stripped of its
  `---` markers and parsed as YAML.
  Returns nil when no such node exists or when any step throws."
  [content ast]
  (try
    (let [locs (take-while (complement zip/end?)
                           (iterate zip/next (zip/seq-zip ast)))]
      (when-let [loc (first (filter metadata-section? locs))]
        (let [span (second (zip/node loc))
              text (from-range span content)]
          (clj-yaml/parse-string (remove-markers text)))))
    ;; best effort: any failure means "no metadata"
    (catch Throwable _ nil)))
| 83 | + |
(defn html-comment?
  "True when the node at zipper location `loc` is tagged \"html_block\" and the
  first element of its last child is \"-->\"."
  [loc]
  (let [tag (first (zip/node loc))]
    (if (= "html_block" tag)
      ;; children are only inspected once the tag matched
      (= "-->" (first (last (zip/children loc))))
      false)))
| 88 | + |
(defn extract-first-comment
  "Walks `ast` with a seq zipper and takes the first location for which
  `html-comment?` holds. The second element of that node is handed to
  `from-range` together with `content`; the resulting text is stripped of its
  `---` markers and parsed as YAML.
  Returns nil when there is no such node, when the comment carries no `---`
  delimited payload, or when a step throws (the exception is printed)."
  [content ast]
  (try
    (when-let [loc (->> (iterate zip/next (zip/seq-zip ast))
                        (take-while (complement zip/end?))
                        (some (fn [loc] (when (html-comment? loc) loc))))]
      ;; `some->` stops as soon as a step yields nil. An ordinary HTML comment
      ;; without `---` markers makes `remove-markers` return nil; that nil must
      ;; not reach the YAML parser, otherwise every such comment ends up in the
      ;; catch branch below and prints an exception.
      (some-> (from-range (-> loc zip/node second) content)
              (remove-markers)
              (clj-yaml/parse-string)))
    (catch Throwable ex
      (println ex)
      nil)))
| 102 | + |
(defn parse-new
  "Appends a trailing \"# END\" heading to `content`, passes it to
  `docker/function-call-with-stdin` for the vonwig/tree-sitter image with the
  markdown language and the given `query`, and returns the `:pty-output` of
  the finished call unchanged."
  [content query]
  (let [terminated (str content "\n# END\n\n")
        call (docker/function-call-with-stdin
              {:image "vonwig/tree-sitter:latest"
               :content terminated
               :command (concat
                         ["-lang" "markdown"]
                         ["-query" query])})
        ;; finish-call runs on a separate thread after a short pause; the
        ;; calling thread blocks until its result is available
        {output :pty-output} (async/<!! (async/thread
                                          (Thread/sleep 10)
                                          (docker/finish-call call)))]
    output))
| 115 | + |
| 116 | +(comment |
| 117 | + ; TODO - migrate to tree-sitter queries, but can we express this with tree-sitter queries? |
| 118 | + (parse-new (slurp "./tprompt1.md") "(document) @doc") |
| 119 | + (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (minus_metadata) @doc)")) |
| 120 | + (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (section (html_block) @html))")) |
| 121 | + (json/parse-string (parse-new (slurp "./tprompt1.md") "(document (section (atx_heading (atx_h1_marker)))* @top-section)"))) |
| 122 | + |
(defn parse-markdown
  "Passes `content` to `docker/function-call-with-stdin` for the
  docker/lsp:treesitter image and returns the `:pty-output` of the finished
  call read as EDN (the custom sexp representation)."
  [content]
  (let [call (docker/function-call-with-stdin
              {:image "docker/lsp:treesitter"
               :content content})
        ;; finish-call runs on a separate thread after a short pause; the
        ;; calling thread blocks until its result is available
        {sexp-text :pty-output} (async/<!! (async/thread
                                             (Thread/sleep 10)
                                             (docker/finish-call call)))]
    (edn/read-string sexp-text)))
| 133 | + |
(defn parse-prompts
  "Parses `content` (with a trailing \"# END\" heading appended) and returns a
  map with
    :messages - vector of whatever `extract-prompts` yields for the parsed ast
    :metadata - result of `extract-metadata`, or of `extract-first-comment`
                when the former is nil"
  [content]
  (let [terminated (str content "\n# END\n\n")
        ast (parse-markdown terminated)
        messages (into [] (extract-prompts terminated ast))
        metadata (or (extract-metadata terminated ast)
                     (extract-first-comment terminated ast))]
    {:messages messages
     :metadata metadata}))
| 146 | + |
| 147 | +(comment |
| 148 | + ; inline same line !,[,],(,) in that order after filtering out other irrelevant things |
| 149 | + ; ^ those are images and the content between the [ ] should be put into a separate message |
| 150 | + ; the first minus_metadata block of the doc |
| 151 | + ; the first html_block section that ends with --> |
| 152 | + ; get content and then check for the --- --- preamble |
| 153 | + ; then try to parse the yaml out of that |
| 154 | + (parse-markdown (slurp "./tprompt2.md")) |
| 155 | + (parse-prompts (slurp "./tprompt1.md")) |
| 156 | + (parse-prompts (slurp "./tprompt2.md")) |
| 157 | + ) |
76 | 158 |
|
77 | 159 | (comment
|
78 | 160 | (string/split content #"\n")
|
|
0 commit comments