Commit aadb781

docs: update contribute-data (#3435)
* docs: flatten API crawling sections down to contribute-data/
* fix: .env.example works by default without GCP auth
* docs: consolidate dbt guide
* docs: update GCS asset factory
* docs: update funding-data guide
* docs: Update bigquery data guide
* docs: update BigQuery data transfer service guide
* docs: simplify database replication guide
* docs: update custom dagster asset guide
* docs: Dagster getting started guide
* docs: fixup contribute-data index
* docs: build
1 parent f74b7f4 commit aadb781

20 files changed (+13054 additions, -22412 deletions)

.env.example

Lines changed: 25 additions & 23 deletions
@@ -1,47 +1,48 @@
 # .env
 ## This .env file is mostly used for Python data ops
 
-## Google Cloud setup
-# You will need to generate Google application credentials
-# Note: You can use your gcloud auth credentials
-GOOGLE_APPLICATION_CREDENTIALS=<path-to-valid-gcp-creds>
-# GCP project ID
-GOOGLE_PROJECT_ID=
-# Used for storing all BigQuery data in the dbt pipeline
-BIGQUERY_DATASET_ID=
-
 ## Dagster Setup
 # You may want to change the location of dagster home if you want it to survive resets
 DAGSTER_HOME=/tmp/dagster-home
 
-# This is used to put generated dbt profiles for dagster in a specific place
-DAGSTER_DBT_TARGET_BASE_DIR=/tmp/dagster-home/generated-dbt
-DAGSTER_DBT_PARSE_PROJECT_ON_LOAD=1
-
-# Used when loading dlt assets into a staging area. It should be set to a GCS
-# bucket that will be used to write to for dlt data transfers into bigquery.
-DAGSTER_STAGING_BUCKET_URL=gs://some-bucket
+## sqlmesh
+SQLMESH_DUCKDB_LOCAL_PATH=/tmp/oso.duckdb
+#SQLMESH_DUCKDB_LOCAL_TRINO_PATH=/tmp/oso-trino.duckdb
 
 # Uncomment the next two vars to use gcp secrets (you'll need to have gcp
 # secrets configured). Unfortunately at this time, if you don't have access to
 # the official oso gcp account uncommenting these will likely not work. The GCP
-# secrets prefix should likely match the dagster deployment's search prefix in
-# flux
-#DAGSTER_USE_LOCAL_SECRETS=False
+# secrets prefix should likely match the dagster deployment's search prefix in flux
+DAGSTER_USE_LOCAL_SECRETS=True
 #DAGSTER_GCP_SECRETS_PREFIX=dagster
 
+## Google Cloud setup
+# You will need to generate Google application credentials.
+# You can log in via `gcloud auth application-default login`
+# Then you can enter the path to your credentials
+# e.g. /home/user/.config/gcloud/application_default_credentials.json
+GOOGLE_APPLICATION_CREDENTIALS=
+# GCP project ID
+GOOGLE_PROJECT_ID=
+# Used for storing all BigQuery data in the dbt pipeline
+BIGQUERY_DATASET_ID=
+# Used when loading dlt assets into a staging area. It should be set to a GCS
+# bucket that will be used to write to for dlt data transfers into bigquery.
+DAGSTER_STAGING_BUCKET_URL=gs://some-bucket
+
 ## Clickhouse setup
 DAGSTER__CLICKHOUSE__HOST=
 DAGSTER__CLICKHOUSE__USER=
 DAGSTER__CLICKHOUSE__PASSWORD=
 
-## sqlmesh
-SQLMESH_DUCKDB_LOCAL_PATH=/tmp/oso.duckdb
-
 ###################
 # DEPRECATED
 ###################
 
+# This is used to put generated dbt profiles for dagster in a specific place
+DAGSTER_DBT_TARGET_BASE_DIR=/tmp/dagster-home/generated-dbt
+DAGSTER_DBT_PARSE_PROJECT_ON_LOAD=0
+
 # Used for data transfer between databases
 CLOUDSTORAGE_BUCKET_NAME=
 
@@ -50,4 +51,5 @@ CLOUDSQL_REGION=
 CLOUDSQL_INSTANCE_ID=
 CLOUDSQL_DB_NAME=
 CLOUDSQL_DB_PASSWORD=
-CLOUDSQL_DB_USER=
+CLOUDSQL_DB_USER=
+
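
The updated Google Cloud block works with Application Default Credentials by default, per the commit message. A minimal sketch of how the variables above could be checked locally before running Dagster (it assumes `python-dotenv` and `google-auth` are installed; the script itself is illustrative and not part of this commit):

```python
# check_env.py -- illustrative only, not part of the OSO repo.
import os

import google.auth              # assumption: google-auth is installed
from dotenv import load_dotenv  # assumption: python-dotenv is installed

load_dotenv()  # read the .env file from the current working directory

# GOOGLE_APPLICATION_CREDENTIALS may stay empty: google.auth.default() then
# falls back to the Application Default Credentials created by
# `gcloud auth application-default login`.
credentials, adc_project = google.auth.default()

print("GCP project:", os.getenv("GOOGLE_PROJECT_ID") or adc_project)
print("DuckDB path:", os.getenv("SQLMESH_DUCKDB_LOCAL_PATH", "/tmp/oso.duckdb"))
print("Staging bucket:", os.getenv("DAGSTER_STAGING_BUCKET_URL", "(unset)"))
```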

apps/docs/docs/contribute-data/api-crawling/index.md

Lines changed: 0 additions & 18 deletions
This file was deleted.

apps/docs/docs/contribute-data/bigquery.md

Lines changed: 0 additions & 84 deletions
@@ -45,87 +45,3 @@ Add the `allAuthenticatedUsers` as the "BigQuery Data Viewer"
 If you have reasons to keep your dataset private,
 you can reach out to us directly on our
 [Discord](https://www.opensource.observer/discord).
-
-## Defining a dbt source
-
-For example, Google maintains a
-[public dataset](https://cloud.google.com/blog/products/data-analytics/ethereum-bigquery-public-dataset-smart-contract-analytics)
-for Ethereum mainnet.
-
-As long as the dataset is publicly available in the US region,
-we can create a dbt source in `oso/warehouse/dbt/models/`
-(see [source](https://github.com/opensource-observer/oso/blob/main/warehouse/dbt/models/ethereum_sources.yml)):
-
-```yaml
-sources:
-  - name: ethereum
-    database: bigquery-public-data
-    schema: crypto_ethereum
-    tables:
-      - name: transactions
-        identifier: transactions
-      - name: traces
-        identifier: traces
-```
-
-We can then reference these tables in a downstream model with
-the `source` macro:
-
-```sql
-select
-  block_timestamp,
-  `hash` as transaction_hash,
-  from_address,
-  receipt_contract_address
-from {{ source("ethereum", "transactions") }}
-```
-
-## Creating a playground dataset (optional)
-
-If the source table is large, we will want to
-extract a subset of the data into a playground dataset
-for testing and development.
-
-For example for GitHub event data,
-we copy just the last 14 days of data
-into a playground dataset, which is used
-when the dbt target is set to `playground`
-(see [source](https://github.com/opensource-observer/oso/blob/main/warehouse/dbt/models/github_sources.yml)):
-
-```yaml
-sources:
-  - name: github_archive
-    database: |
-      {%- if target.name in ['playground', 'dev'] -%} opensource-observer
-      {%- elif target.name == 'production' -%} githubarchive
-      {%- else -%} invalid_database
-      {%- endif -%}
-    schema: |
-      {%- if target.name in ['playground', 'dev'] -%} oso
-      {%- elif target.name == 'production' -%} day
-      {%- else -%} invalid_schema
-      {%- endif -%}
-    tables:
-      - name: events
-        identifier: |
-          {%- if target.name in ['playground', 'dev'] -%} stg_github__events
-          {%- elif target.name == 'production' -%} 20*
-          {%- else -%} invalid_table
-          {%- endif -%}
-```
-
-### Choosing a playground window size
-
-There is a fine balance between choosing a playground data set window
-that is sufficiently small for affordable testing and development,
-yet produces meaningful results to detect issues in your queries.
-
-:::warning
-Coming soon... This section is a work in progress.
-:::
-
-### Copying the playground dataset
-
-:::warning
-Coming soon... This section is a work in progress.
-:::
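
The deleted playground sections were still marked as a work in progress. For reference, a rough sketch of what copying a 14-day window into a playground dataset could look like with the `google-cloud-bigquery` client (the project, dataset, and table names here are hypothetical, and this is not the tooling the OSO pipeline actually uses):

```python
# copy_playground.py -- hypothetical sketch, not OSO's actual tooling.
from google.cloud import bigquery  # assumption: google-cloud-bigquery is installed

client = bigquery.Client()

# Hypothetical names: substitute your own project and dataset.
SOURCE = "bigquery-public-data.crypto_ethereum.transactions"
DEST = "my-project.playground.ethereum_transactions"

# Keep only the last 14 days, mirroring the playground window described above.
query = f"""
CREATE OR REPLACE TABLE `{DEST}` AS
SELECT *
FROM `{SOURCE}`
WHERE block_timestamp >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 14 DAY)
"""

client.query(query).result()  # wait for the copy job to finish
print(f"Copied the last 14 days of {SOURCE} into {DEST}")
```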

apps/docs/docs/contribute-data/bq-data-transfer.md

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 ---
 title: BigQuery Data Transfer Service
-sidebar_position: 6
+sidebar_position: 2
+sidebar_class_name: hidden
 ---
 
 BigQuery comes with a built-in data transfer service

apps/docs/docs/contribute-data/api-crawling/crawl-api-advanced.png renamed to apps/docs/docs/contribute-data/crawl-api-advanced.png

File renamed without changes.

apps/docs/docs/contribute-data/api-crawling/crawl-api-example-defillama.png renamed to apps/docs/docs/contribute-data/crawl-api-example-defillama.png

File renamed without changes.

apps/docs/docs/contribute-data/api-crawling/crawl-api-example-opencollective.png renamed to apps/docs/docs/contribute-data/crawl-api-example-opencollective.png

File renamed without changes.

apps/docs/docs/contribute-data/api-crawling/crawl-api-graphql-pipeline.png renamed to apps/docs/docs/contribute-data/crawl-api-graphql-pipeline.png

File renamed without changes.

apps/docs/docs/contribute-data/dagster.md

Lines changed: 3 additions & 2 deletions
@@ -1,13 +1,14 @@
 ---
 title: Write a Custom Dagster Asset
-sidebar_position: 6
+sidebar_position: 7
 ---
 
 Before writing a fully custom Dagster asset,
 we recommend you first see if the previous guides on
 [BigQuery datasets](./bigquery.md),
 [database replication](./database.md),
-[API crawling](./api-crawling/index.md)
+[GraphQL API crawling](./graphql-api.md),
+or [REST API crawling](./rest-api.md)
 may be a better fit.
 This guide should only be used in the rare cases where you cannot
 use the other methods.
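
For readers landing on this guide, a minimal custom Dagster asset might look like the sketch below (plain Dagster idioms with hypothetical names; the OSO repo wires assets into its own definitions and factories, which this commit does not show):

```python
# my_custom_asset.py -- minimal sketch; asset name and URL are hypothetical.
import dagster as dg
import requests  # assumption: requests is available for the example fetch


@dg.asset(key_prefix="example_source")
def example_events() -> dg.MaterializeResult:
    """Fetch a small JSON payload and record how many rows were seen."""
    response = requests.get("https://example.com/api/events", timeout=30)
    response.raise_for_status()
    records = response.json()
    return dg.MaterializeResult(metadata={"record_count": len(records)})
```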
apps/docs/docs/contribute-data/database.md

Lines changed: 38 additions & 43 deletions
@@ -1,26 +1,26 @@
 ---
-title: Provide Access to Your Database
+title: Replicate your SQL Database
 sidebar_position: 3
 ---
 
-OSO's dagster infrastructure has support for database replication into our data
+OSO's Dagster infrastructure has support for database replication into our data
 warehouse by using Dagster's "embedded-elt" that integrates with the library
 [dlt](https://dlthub.com/).
 
-## Configure your database as a dagster asset
+## Configure your database as a Dagster asset
 
-There are many possible ways to configure a database as a dagster asset,
-however, to reduce complexity of configuration we provide a single interface for
-specifying a SQL database for replication. The SQL database _must_ be a database
-that is [supported by
-dlt](https://dlthub.com/devel/dlt-ecosystem/verified-sources/sql_database). In
-general, we replicate _all_ columns and for now custom column selection is not
+There are many possible ways to configure a database as a Dagster asset.
+To simplify things, we have built a factory function, `sql_assets`,
+to automatically replicate any SQL database.
+The SQL database _must_ be a database that is
+[supported by dlt](https://dlthub.com/devel/dlt-ecosystem/verified-sources/sql_database).
+In general, we replicate _all_ columns and for now custom column selection is not
 available in our interface.
 
-This section shows how to setup a database with two tables as a set of sql
-assets. The table named `some_incremental_database` has a chronologically
-organized or updated dataset and can therefore be loaded incrementally. The
-second table, `some_nonincremental_database`, does not have a way to be loaded
+This section shows how to replicate 2 tables in a database.
+The first table, named `some_incremental_database`, has a time column
+and can be loaded incrementally.
+The second table, `some_nonincremental_database`, does not have a way to be loaded
 incrementally and will force a full refresh upon every sync.
 
 To setup this database replication, you can add a new python file to
@@ -52,25 +52,21 @@ my_database = sql_assets(
 ```
 
 The first three lines of the file import some necessary tooling to configure a
-sql database:
-
-- The first import, `sql_assets`, is an asset factory created by the OSO team
-  that enables this "easy" configuration of sql assets.
-- The second import, `SecretReference`, is a tool used to reference a secret in
-  a secret resolver. The secret resolver can be configured differently based on
-  the environment, but on production we use this to reference a cloud based secret
-  manager.
-- The final import, `incremental`, is used to specify a column to use for
-  incremental loading. This is a `dlt` constructor that is passed to the
-  configuration.
-
-The `sql_assets`, factory takes 3 arguments:
-
-- The first argument is an asset key prefix which is used to both specify an
-  asset key prefix and also used when generating asset related names inside the
-  factory. In general, this should match the filename of the containing python
-  file unless you have a more complex set of assets to configure. This name is
-  also used as the dataset name into which this data will be loaded.
+SQL database:
+
+- `sql_assets`: an asset factory created by the OSO team
+  that enables this simple configuration of SQL assets.
+- `SecretReference`: a secret reference in the OSO secret resolver.
+  The secret resolver can be configured differently based on
+  the environment. On production, we use a cloud-based secret manager.
+- `incremental`: used to specify a column to use for incremental loading.
+  This is a `dlt` constructor that is passed to the configuration.
+
+The `sql_assets` factory takes 3 arguments:
+
+- The first argument is an asset key prefix, used to group assets generated
+  by the factory. In general, this should match the filename of the python
+  file unless you have more complex requirements.
 - The second argument must be a `SecretReference` object that will be used to
   retrieve the credentials that you will provide at a later step to the OSO
   team. The `SecretReference` object has two required keyword arguments:
@@ -81,11 +77,10 @@ The `sql_assets`, factory takes 3 arguments:
   - `key` - This is an arbitrary name for the secret.
 
 - The third argument is a list of dictionaries that define options for tables
-  that should be replicated into the data warehouse. The most important options
-  here are:
+  that should be replicated into OSO.
 
-  - `table` - The table name
-  - `destination_table_name` - The table name to use in the data warehouse
+  - `table` - The source table name
+  - `destination_table_name` - The destination table name to use in the OSO data lake
   - `incremental` - An `incremental` object that defines time/date based column
     to use for incrementally loading a database.
 
@@ -95,11 +90,11 @@ The `sql_assets`, factory takes 3 arguments:
 
 ## Enabling access to your database
 
-Before the OSO infrastructure can begin to synchronize your database to the data
-warehouse, it will need to be provided access to the database. At this time
-there is no automated process for this. Once you're ready to get your database
-integrated, you will want to contact the OSO team on our
-[Discord](https://www.opensource.observer/discord). Be prepared to provide
-credentials (we will work out a secure method of transmission) and also ensure
-that you have access to update any firewall settings that may be required for us
+For the asset to run in OSO production, we will need access to
+your secrets (e.g. password or connection string).
+At this time there is no automated process for this.
+You can contact the OSO team on our
+[Discord](https://www.opensource.observer/discord).
+Be prepared to provide credentials via a secure method of transmission.
+Also remember to update any firewall settings that may be required for us
 to access your database server.
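
The argument list above is easier to follow next to a concrete file. Below is a sketch of what such a configuration might look like, based only on the arguments documented in this diff (the import paths and the `group_name` keyword are assumptions, not something this commit confirms):

```python
# my_database.py -- sketch only; place it wherever the guide above says
# new asset files go.
from dlt.sources import incremental

# Assumed OSO-internal import paths; check the repo for the real modules.
from oso_dagster.factories import sql_assets
from oso_dagster.utils import SecretReference

my_database = sql_assets(
    # 1) asset key prefix: should match this file's name ("my_database")
    "my_database",
    # 2) where the OSO secret resolver should find your credentials;
    #    `group_name` is an assumed keyword, `key` is documented above
    SecretReference(group_name="my_database", key="connection_string"),
    # 3) one dict per table to replicate
    [
        {
            "table": "some_incremental_database",
            "destination_table_name": "some_incremental_database",
            # time/date column used for incremental loads
            "incremental": incremental("updated_at"),
        },
        {
            # no usable time column, so this table is fully refreshed each sync
            "table": "some_nonincremental_database",
            "destination_table_name": "some_nonincremental_database",
        },
    ],
)
```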
