@@ -2,12 +2,14 @@
 import multiprocessing
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, Future
+import concurrent.futures
 from contextlib import suppress
 from logging import getLogger
 from queue import Queue
 from threading import Event
 from time import sleep
-from typing import Dict, Optional, List, Any, Tuple
+from typing import Dict, Optional, List, Any, Tuple, Set
+from pathlib import Path

 import pkg_resources
 import yaml
@@ -25,9 +27,12 @@
 from rich.live import Live
 from sqlalchemy.engine import Engine

+
 from cloud2sql.analytics import AnalyticsEventSender
 from cloud2sql.show_progress import CollectInfo
 from cloud2sql.sql import SqlUpdater, sql_updater
+from cloud2sql.parquet import ParquetModel, ParquetWriter
+

 log = getLogger("resoto.cloud2sql")

@@ -65,6 +70,73 @@ def configure(path_to_config: Optional[str]) -> Json:


 def collect(
+    collector: BaseCollectorPlugin, engine: Optional[Engine], feedback: CoreFeedback, args: Namespace, config: Json
+) -> Tuple[str, int, int]:
+    if engine:
+        return collect_sql(collector, engine, feedback, args)
+    else:
+        return collect_parquet(collector, feedback, config)
+
+
+def prepare_node(node: BaseResource, collector: BaseCollectorPlugin) -> Json:
+    node._graph = collector.graph
+    exported = node_to_dict(node)
+    exported["type"] = "node"
+    exported["ancestors"] = {
+        "cloud": {"reported": {"id": node.cloud().name}},
+        "account": {"reported": {"id": node.account().name}},
+        "region": {"reported": {"id": node.region().name}},
+        "zone": {"reported": {"id": node.zone().name}},
+    }
+    return exported
+
+
+def collect_parquet(collector: BaseCollectorPlugin, feedback: CoreFeedback, config: Json) -> Tuple[str, int, int]:
+    # collect cloud data
+    feedback.progress_done(collector.cloud, 0, 1)
+    collector.collect()
+    # read the kinds created from this collector
+    kinds = [from_json(m, Kind) for m in collector.graph.export_model(walk_subclasses=False)]
+    model = ParquetModel(Model({k.fqn: k for k in kinds}))
+    node_edge_count = len(collector.graph.nodes) + len(collector.graph.edges)
+    ne_current = 0
+    progress_update = max(node_edge_count // 100, 1)  # report roughly every 1%; at least 1 to avoid modulo by zero
+    feedback.progress_done("sync_db", 0, node_edge_count, context=[collector.cloud])
+
+    # group all edges by kind of from/to
+    edges_by_kind: Set[Tuple[str, str]] = set()
+    for from_node, to_node, key in collector.graph.edges:
+        if key.edge_type == EdgeType.default:
+            edges_by_kind.add((from_node.kind, to_node.kind))
+    # create the ddl metadata from the kinds
+    model.create_schema(list(edges_by_kind))
+    # ingest the data
+    parquet_conf = config.get("destinations", {}).get("parquet")
+    assert parquet_conf
+    parquet_path = Path(parquet_conf["path"])
+    parquet_batch_size = int(parquet_conf["batch_size"])
+    writer = ParquetWriter(model, parquet_path, parquet_batch_size)
+    node: BaseResource
+    for node in sorted(collector.graph.nodes, key=lambda n: n.kind):
+        exported = prepare_node(node, collector)
+        writer.insert_node(exported)
+        ne_current += 1
+        if ne_current % progress_update == 0:
+            feedback.progress_done("sync_db", ne_current, node_edge_count, context=[collector.cloud])
+    for from_node, to_node, key in collector.graph.edges:
+        if key.edge_type == EdgeType.default:
+            writer.insert_node({"from": from_node.chksum, "to": to_node.chksum, "type": "edge"})
+            ne_current += 1
+            if ne_current % progress_update == 0:
+                feedback.progress_done("sync_db", ne_current, node_edge_count, context=[collector.cloud])
+
+    writer.close()
+
+    feedback.progress_done(collector.cloud, 1, 1)
+    return collector.cloud, len(collector.graph.nodes), len(collector.graph.edges)
+
+
+def collect_sql(
     collector: BaseCollectorPlugin, engine: Engine, feedback: CoreFeedback, args: Namespace
 ) -> Tuple[str, int, int]:
     # collect cloud data
@@ -75,16 +147,8 @@ def collect(
     nodes_by_kind: Dict[str, List[Json]] = defaultdict(list)
     node: BaseResource
     for node in collector.graph.nodes:
-        node._graph = collector.graph
         # create an exported node with the same scheme as resotocore
-        exported = node_to_dict(node)
-        exported["type"] = "node"
-        exported["ancestors"] = {
-            "cloud": {"reported": {"id": node.cloud().name}},
-            "account": {"reported": {"id": node.account().name}},
-            "region": {"reported": {"id": node.region().name}},
-            "zone": {"reported": {"id": node.zone().name}},
-        }
+        exported = prepare_node(node, collector)
         nodes_by_kind[node.kind].append(exported)

     # group all edges by kind of from/to
@@ -138,33 +202,35 @@ def show_messages(core_messages: Queue[Json], end: Event) -> None:
             rich_print(message)


-def collect_from_plugins(engine: Engine, args: Namespace, sender: AnalyticsEventSender) -> None:
+def collect_from_plugins(engine: Optional[Engine], args: Namespace, sender: AnalyticsEventSender) -> None:
     # the multiprocessing manager is used to share data between processes
     mp_manager = multiprocessing.Manager()
     core_messages: Queue[Json] = mp_manager.Queue()
     feedback = CoreFeedback("cloud2sql", "collect", "collect", core_messages)
     raw_config = configure(args.config)
     sources = raw_config["sources"]
     all_collectors = collectors(sources, feedback)
-    analytics = {"total": len(all_collectors), "engine": engine.dialect.name} | {name: 1 for name in all_collectors}
+    engine_name = engine.dialect.name if engine else "parquet"
+    analytics = {"total": len(all_collectors), "engine": engine_name} | {name: 1 for name in all_collectors}
     end = Event()
     with ThreadPoolExecutor(max_workers=4) as executor:
         try:
             if args.show == "progress":
                 executor.submit(show_messages, core_messages, end)
             futures: List[Future[Any]] = []
             for collector in all_collectors.values():
-                futures.append(executor.submit(collect, collector, engine, feedback, args))
+                futures.append(executor.submit(collect, collector, engine, feedback, args, raw_config))
             for future in concurrent.futures.as_completed(futures):
                 name, nodes, edges = future.result()
                 analytics[f"{name}_nodes"] = nodes
                 analytics[f"{name}_edges"] = edges
             sender.capture("collect", **analytics)
             # when all collectors are done, we can swap all temp tables
-            swap_tables = "Make latest snapshot available"
-            feedback.progress_done(swap_tables, 0, 1)
-            SqlUpdater.swap_temp_tables(engine)
-            feedback.progress_done(swap_tables, 1, 1)
+            if engine:
+                swap_tables = "Make latest snapshot available"
+                feedback.progress_done(swap_tables, 0, 1)
+                SqlUpdater.swap_temp_tables(engine)
+                feedback.progress_done(swap_tables, 1, 1)
         except Exception as e:
             # set end and wait for live to finish, otherwise the cursor is not reset
             end.set()
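
Note on the new parquet destination: collect_parquet only reads destinations.parquet.path and destinations.parquet.batch_size from the config returned by configure. A minimal sketch of the expected shape, with illustrative values only (the key names under sources and the concrete values are assumptions, not taken from this commit):

    from pathlib import Path

    # hypothetical parsed YAML config; only the destinations.parquet keys are consumed by collect_parquet
    raw_config = {
        "sources": {"aws": {}},  # collector-specific settings elided
        "destinations": {
            "parquet": {
                "path": "parquet-out",  # directory handed to ParquetWriter (illustrative)
                "batch_size": 100000,   # batch size handed to ParquetWriter (illustrative)
            }
        },
    }

    parquet_conf = raw_config.get("destinations", {}).get("parquet")
    assert parquet_conf
    parquet_path = Path(parquet_conf["path"])
    parquet_batch_size = int(parquet_conf["batch_size"])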