Skip to content

Commit c16b706

Browse files
committed
feat: new ref tracking tables
1 parent 323d4d0 commit c16b706

File tree

6 files changed

+429
-157
lines changed

6 files changed

+429
-157
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,27 @@ pnpm crawl -- --brand 59 --model 5940 # Modelo específico
5050
pnpm crawl -- --brand 59 --model 5940,5941 # Múltiplos modelos
5151
pnpm crawl -- --reference 328 # Tabela de referência específica
5252
pnpm crawl -- --classify # Classificar modelos novos via AI
53+
pnpm crawl -- --force # Re-buscar tudo ignorando status de sync
5354
ALLOWED_BRANDS=21,22,23 pnpm crawl # Limitar marcas via env
5455

5556
pnpm status # Estatísticas do banco
5657
pnpm classify # Classificar modelos sem segmento
5758
pnpm classify -- --dry-run # Preview da classificação
5859
```
5960

61+
## Como Funciona o Crawl
62+
63+
O crawler usa um sistema de **sync granular** que rastreia o progresso por tabela de referência:
64+
65+
1. **Fase 1 - Brands**: Busca marcas da API e armazena no banco
66+
2. **Fase 2 - Models**: Para cada marca, busca modelos
67+
3. **Fase 3 - Model-Years**: Para cada modelo, busca anos/combustíveis
68+
4. **Fase 4 - Prices**: Para cada ano, busca o preço
69+
70+
Cada fase é rastreada independentemente. Se o crawl for interrompido, continua de onde parou na próxima execução.
71+
72+
Use `--force` para ignorar o status de sync e re-buscar todos os dados.
73+
6074
## Docker
6175

6276
```bash

initial.sql

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ CREATE TABLE IF NOT EXISTS reference_tables (
33
code INTEGER UNIQUE NOT NULL,
44
month INTEGER NOT NULL,
55
year INTEGER NOT NULL,
6-
crawled_at TIMESTAMP
6+
crawled_at TIMESTAMP,
7+
brands_crawled_at TIMESTAMP
78
);
89

910
CREATE TABLE IF NOT EXISTS brands (
@@ -55,3 +56,36 @@ CREATE INDEX IF NOT EXISTS idx_models_segment ON models(segment);
5556
CREATE INDEX IF NOT EXISTS idx_reference_year_month ON reference_tables(year, month);
5657
CREATE INDEX IF NOT EXISTS idx_model_years_year ON model_years(year);
5758
CREATE INDEX IF NOT EXISTS idx_brands_name ON brands(name);
59+
60+
-- Crawl status tracking tables (per reference)
61+
-- reference_brands: crawl-status row linking one reference table to one brand.
-- The UNIQUE constraint below allows at most one row per (reference table, brand) pair.
CREATE TABLE IF NOT EXISTS reference_brands (
62+
id SERIAL PRIMARY KEY,
63+
-- Foreign key to reference_tables(id); required.
reference_table_id INTEGER NOT NULL REFERENCES reference_tables(id),
64+
-- Foreign key to brands(id); required.
brand_id INTEGER NOT NULL REFERENCES brands(id),
65+
-- Nullable. NOTE(review): presumably NULL until the models of this brand have been
-- fetched for this reference table -- confirm against the crawler code.
models_crawled_at TIMESTAMP,
66+
UNIQUE(reference_table_id, brand_id)
67+
);
68+
69+
-- reference_models: crawl-status row linking one reference table to one model.
-- The UNIQUE constraint below allows at most one row per (reference table, model) pair.
CREATE TABLE IF NOT EXISTS reference_models (
70+
id SERIAL PRIMARY KEY,
71+
-- Foreign key to reference_tables(id); required.
reference_table_id INTEGER NOT NULL REFERENCES reference_tables(id),
72+
-- Foreign key to models(id); required.
model_id INTEGER NOT NULL REFERENCES models(id),
73+
-- Nullable. NOTE(review): presumably NULL until the years/fuels of this model have
-- been fetched for this reference table -- confirm against the crawler code.
years_crawled_at TIMESTAMP,
74+
UNIQUE(reference_table_id, model_id)
75+
);
76+
77+
-- reference_model_years: crawl-status row linking one reference table to one model-year.
-- The UNIQUE constraint below allows at most one row per (reference table, model-year) pair.
CREATE TABLE IF NOT EXISTS reference_model_years (
78+
id SERIAL PRIMARY KEY,
79+
-- Foreign key to reference_tables(id); required.
reference_table_id INTEGER NOT NULL REFERENCES reference_tables(id),
80+
-- Foreign key to model_years(id); required.
model_year_id INTEGER NOT NULL REFERENCES model_years(id),
81+
-- Nullable. NOTE(review): presumably NULL until the price of this model-year has
-- been fetched for this reference table -- confirm against the crawler code.
price_crawled_at TIMESTAMP,
82+
UNIQUE(reference_table_id, model_year_id)
83+
);
84+
85+
-- Crawl status indexes: one single-column index per foreign key of the three
-- crawl-status tables (reference_brands, reference_models, reference_model_years).
-- NOTE(review): if this targets PostgreSQL (SERIAL suggests so), each table's
-- UNIQUE(reference_table_id, ...) constraint already creates an index whose leading
-- column is reference_table_id, so the three *_ref indexes below may be redundant --
-- confirm with EXPLAIN before dropping them.
86+
CREATE INDEX IF NOT EXISTS idx_reference_brands_ref ON reference_brands(reference_table_id);
87+
CREATE INDEX IF NOT EXISTS idx_reference_brands_brand ON reference_brands(brand_id);
88+
CREATE INDEX IF NOT EXISTS idx_reference_models_ref ON reference_models(reference_table_id);
89+
CREATE INDEX IF NOT EXISTS idx_reference_models_model ON reference_models(model_id);
90+
CREATE INDEX IF NOT EXISTS idx_reference_model_years_ref ON reference_model_years(reference_table_id);
91+
CREATE INDEX IF NOT EXISTS idx_reference_model_years_my ON reference_model_years(model_year_id);

0 commit comments

Comments
 (0)