Add a type field to the crawl results.

tooryx · copybara-github · commit eb87c7e45767 · 2026-02-12T02:30:52.000-08:00
Currently, Tsunami stores the HTTP response directly in the crawl results. This new field supports several storage strategies. More specifically, in Goonami we only store a hash of the page for efficiency.

PiperOrigin-RevId: 869109670
Change-Id: I1cc03dc60006090c5c60319e505e941aba4da697
diff --git a/proto/web_crawl.proto b/proto/web_crawl.proto
@@ -82,6 +82,13 @@ message HttpHeader {
   string value = 2;
 }
 
+// The type of content stored in the CrawlResult.
+enum CrawlContentType {
+  CONTENT_TYPE_UNSPECIFIED = 0;  // No content type was specified.
+  CONTENT_TYPE_RAW = 1;  // The whole response body is stored.
+  CONTENT_TYPE_HASH = 2;  // Only a hash of the response body is stored.
+}
+
 message CrawlResult {
   // The target visited by the crawler.
   CrawlTarget crawl_target = 1;
@@ -100,4 +107,9 @@ message CrawlResult {
 
   // Http headers of the response
   repeated HttpHeader response_headers = 6;
+
+  // The type of content stored in this CrawlResult. By default, the whole
+  // response body is stored (RAW), but some configurations can request
+  // storing only a hash of the response body (HASH).
+  CrawlContentType crawl_content_type = 7;
 }