korean_dashbaord/fund_scraper.py at main · karhites/korean_dashbaord · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
"""
Main scraper for Korean fund management reports (자산운용보고서).
Searches DART for semiconductor/memory fund reports, downloads them,
and parses holdings data.
"""

import re
import time
import pandas as pd
from datetime import datetime
from dart_api import DartAPI
from holdings_parser import find_holdings_in_document, parse_fund_report_full
from storage import (
    init_db, save_filing, save_holdings, mark_filing_parsed,
    get_unparsed_filings
)
from config import SEMICONDUCTOR_KEYWORDS, MAJOR_ASSET_MANAGERS


class FundScraper:
    def __init__(self):
        """Create the DART API client and run the storage initialisation."""
        # Client object used by every search/download method on this class.
        self.api = DartAPI()
        # init_db is imported from the storage module; presumably it prepares
        # the database that save_filing/save_holdings write to — confirm there.
        init_db()

    # -------------------------------------------------------
    # Step 1: Search for semiconductor/memory fund reports
    # -------------------------------------------------------
    def search_semiconductor_funds(self, start='2023-01-01', end=None, max_pages=20):
        """
        Search DART for fund management reports (자산운용보고서) that are
        related to semiconductors/memory, and save every match.

        Strategy:
        - Search kind='G' (fund disclosures) for all fund reports
        - Keep only filings whose report_nm contains '자산운용보고서'
        - Select those whose report_nm or corp_name matches one of
          SEMICONDUCTOR_KEYWORDS
        - Also select filings whose corp_name matches MAJOR_ASSET_MANAGERS
          and whose report_nm contains a tech/semiconductor keyword
        - Combine both selections, deduplicated on rcept_no

        Args:
            start: First date of the search range, 'YYYY-MM-DD'.
            end: Last date of the search range, 'YYYY-MM-DD'. Defaults to
                today's date.
            max_pages: Forwarded unchanged to ``self.api.search_filings``.

        Returns:
            DataFrame with the combined, deduplicated filings. An empty
            DataFrame is returned when the search itself finds nothing.
            Every returned row has been passed to ``save_filing``.
        """
        print("=" * 70)
        print("SEARCHING FOR SEMICONDUCTOR/MEMORY FUND REPORTS")
        print("=" * 70)

        if end is None:
            end = datetime.today().strftime('%Y-%m-%d')

        print(f"\nDate range: {start} to {end}")
        print("Searching fund disclosures (kind='G')...\n")

        df = self.api.search_filings(
            start=start, end=end, kind='G', max_pages=max_pages
        )

        if df.empty:
            print("No fund filings found.")
            return pd.DataFrame()

        print(f"\nTotal fund filings found: {len(df)}")

        # Filter for 자산운용보고서 specifically
        report_mask = df['report_nm'].str.contains('자산운용보고서', na=False)
        df_reports = df[report_mask].copy()
        print(f"자산운용보고서 (fund management reports): {len(df_reports)}")

        # Filter for semiconductor/memory keywords in name.
        # The keywords are joined into a regex alternation as-is (not escaped).
        keyword_pattern = '|'.join(SEMICONDUCTOR_KEYWORDS)
        semi_mask = (
            df_reports['report_nm'].str.contains(keyword_pattern, case=False, na=False) |
            df_reports['corp_name'].str.contains(keyword_pattern, case=False, na=False)
        )
        df_semi = df_reports[semi_mask].copy()
        print(f"Semiconductor/memory related: {len(df_semi)}")

        # Also filter for major asset managers
        manager_pattern = '|'.join(MAJOR_ASSET_MANAGERS)
        manager_mask = df_reports['corp_name'].str.contains(manager_pattern, case=False, na=False)
        # Among major managers, look for tech/semiconductor fund reports
        tech_keywords = '반도체|메모리|테크|IT|4차|TIGER|KODEX|ACE|SOL|성장|혁신|미래'
        tech_in_name = df_reports['report_nm'].str.contains(tech_keywords, case=False, na=False)
        df_managers = df_reports[manager_mask & tech_in_name].copy()
        print(f"Major asset managers' tech/semi funds: {len(df_managers)}")

        # Combine and deduplicate
        combined = pd.concat([df_semi, df_managers]).drop_duplicates(subset='rcept_no')
        print(f"\nCombined unique filings: {len(combined)}")

        # Save every combined filing. Saving used to happen inside the
        # 20-row preview loop below, so matches beyond the first 20 were
        # returned to the caller but never written to the database.
        for _, row in combined.iterrows():
            save_filing(
                rcp_no=row['rcept_no'],
                corp_code=row.get('corp_code', ''),
                corp_name=row.get('corp_name', ''),
                report_name=row.get('report_nm', ''),
                filing_date=row.get('rcept_dt', ''),
                fund_name=self._extract_fund_name(row.get('report_nm', '')),
            )

        # The console preview stays capped at 20 rows to keep output short.
        if len(combined) > 0:
            print("\nSample filings found:")
            for _, row in combined.head(20).iterrows():
                print(f"  [{row.get('rcept_dt','')}] {row.get('corp_name','')} - {row.get('report_nm','')[:60]}")

        return combined

    def search_by_fund_name(self, fund_name, start='2020-01-01', end=None, max_pages=30):
        """
        Search for a specific fund by name.
        fund_name: the name of the fund/company to search (e.g., '삼성자산운용')
        """
        print(f"\n{'='*70}")
        print(f"SEARCHING FOR: {fund_name}")
        print(f"{'='*70}")

        if end is None:
            end = datetime.today().strftime('%Y-%m-%d')

        # Try both kind='G' (fund disclosures) and general search
        df = self.api.search_filings(
            start=start, end=end, kind='G', max_pages=max_pages
        )

        if df.empty:
            print("No filings found.")
            return pd.DataFrame()

        # Filter for the fund name
        mask = (
            df['corp_name'].str.contains(fund_name, case=False, na=False) |
            df['report_nm'].str.contains(fund_name, case=False, na=False)
        )
        matched = df[mask].copy()

        # Further filter for 자산운용보고서
        report_mask = matched['report_nm'].str.contains('자산운용보고서', na=False)
        reports = matched[report_mask]

        print(f"Found {len(matched)} total filings, {len(reports)} 자산운용보고서")

        for _, row in reports.iterrows():
            save_filing(
                rcp_no=row['rcept_no'],
                corp_code=row.get('corp_code', ''),
                corp_name=row.get('corp_name', ''),
                report_name=row.get('report_nm', ''),
                filing_date=row.get('rcept_dt', ''),
                fund_name=self._extract_fund_name(row.get('report_nm', '')),
            )

        return reports

    # -------------------------------------------------------
    # Step 2: Download and parse fund reports
    # -------------------------------------------------------
    def parse_filing(self, rcp_no, corp_name='', filing_date=''):
        """
        Fetch one filing's sub-documents and extract holdings from them.

        Sub-documents whose title contains a holdings-related keyword are
        parsed; when no title matches, every sub-document is tried. Each
        HTML page goes through find_holdings_in_document first and
        parse_fund_report_full second.

        Returns the concatenated holdings DataFrame after handing it to
        save_holdings. When there are no sub-documents or no holdings are
        found, the filing is passed to mark_filing_parsed and an empty
        DataFrame is returned.
        """
        print(f"\n  Parsing filing: {rcp_no}")

        # The sub-document index tells us where the portfolio section lives
        sub_docs = self.api.get_sub_docs(rcp_no)
        if sub_docs.empty:
            print(f"    No sub-documents found")
            mark_filing_parsed(rcp_no)
            return pd.DataFrame()

        doc_rows = [entry for _, entry in sub_docs.iterrows()]
        print(f"    Sub-documents: {len(sub_docs)}")
        for entry in doc_rows:
            print(f"      - {entry['title']}")

        # Title fragments that hint at a holdings/portfolio section
        holdings_keywords = ['자산구성', '자산현황', '보유', '투자대상', '자산운용', '운용현황', '구성내역']
        target_docs = [
            entry for entry in doc_rows
            if any(kw in entry['title'] for kw in holdings_keywords)
        ]

        # Nothing looked like a holdings section: fall back to every sub-doc
        if not target_docs:
            print(f"    No specific holdings section found, trying all sub-docs...")
            target_docs = doc_rows

        all_holdings = []
        for entry in target_docs:
            print(f"    Fetching: {entry['title']}")
            html = self.api.fetch_sub_doc_html(entry['url'])
            if not html:
                continue

            quick_df = find_holdings_in_document(html)
            if not quick_df.empty:
                print(f"      Found {len(quick_df)} holdings!")
                all_holdings.append(quick_df)
                continue

            # The quick pass found nothing; run the full parser instead
            parsed = parse_fund_report_full(html)
            if parsed['holdings']:
                for frame in parsed['holdings']:
                    print(f"      Found {len(frame)} holdings (full parse)!")
                    all_holdings.append(frame)
            elif parsed['raw_tables']:
                print(f"      Found {len(parsed['raw_tables'])} tables (no holdings pattern matched)")
                for table in parsed['raw_tables'][:3]:
                    print(f"        Table: {table['rows']}x{table['cols']} near '{table['context'][:50]}'")

        if not all_holdings:
            mark_filing_parsed(rcp_no)
            print(f"    No holdings data found in this filing")
            return pd.DataFrame()

        combined = pd.concat(all_holdings, ignore_index=True)
        # The fund name is taken from the first sub-document's title;
        # sub_docs is known to be non-empty here because of the early return.
        fund_name = self._extract_fund_name(sub_docs.iloc[0]['title'])
        save_holdings(combined, rcp_no, filing_date, fund_name, corp_name)
        print(f"    Saved {len(combined)} holdings to database")
        return combined

    def parse_all_unparsed(self, limit=None):
        """
        Parse all filings that haven't been parsed yet.

        Args:
            limit: Maximum number of filings to process in this run.
                ``None`` (the default) processes all of them; ``0``
                processes none.

        Returns:
            None. Each filing is handed to ``parse_filing``; a filing whose
            parsing raises is reported and passed to ``mark_filing_parsed``.
        """
        unparsed = get_unparsed_filings()
        if unparsed.empty:
            print("No unparsed filings.")
            return

        total = len(unparsed)
        # Compare against None rather than truthiness so that limit=0 means
        # "parse nothing" instead of silently meaning "parse everything".
        if limit is not None:
            unparsed = unparsed.head(limit)

        batch_size = len(unparsed)
        print(f"\nParsing {batch_size} of {total} unparsed filings...")

        for position, (_, filing) in enumerate(unparsed.iterrows(), start=1):
            # str() keeps a missing (None/NaN) report name from raising here,
            # outside the try block, which would abort the whole batch.
            report_label = str(filing['report_name'])[:50]
            print(f"\n[{position}/{batch_size}] {filing['corp_name']} - {report_label}")
            try:
                self.parse_filing(
                    filing['rcp_no'],
                    corp_name=filing['corp_name'],
                    filing_date=filing['filing_date'],
                )
            except Exception as e:
                # Best effort: report the failure and mark the filing so one
                # bad document cannot block the rest of the queue.
                print(f"  ERROR: {e}")
                mark_filing_parsed(filing['rcp_no'])
            time.sleep(0.5)  # Be nice to the server

    # -------------------------------------------------------
    # Step 3: Download raw document for manual inspection
    # -------------------------------------------------------
    def download_raw_document(self, rcp_no, save_dir='raw_docs'):
        """Download the raw XML/HTML documents from a filing for inspection"""
        import os
        os.makedirs(save_dir, exist_ok=True)

        docs = self.api.download_document(rcp_no)
        if not docs:
            print(f"  Failed to download documents for {rcp_no}")
            return []

        saved = []
        for fname, content in docs.items():
            filepath = os.path.join(save_dir, f'{rcp_no}_{fname}')
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            saved.append(filepath)
            print(f"  Saved: {filepath} ({len(content):,} chars)")

        return saved

    # -------------------------------------------------------
    # Utility
    # -------------------------------------------------------
    def _extract_fund_name(self, report_name):
        """Extract the fund name from a report title"""
        # Report names often look like:
        # "자산운용보고서(삼성 코리아 대표 증권 자투자신탁 제1호[주식])"
        match = re.search(r'[(\[](.*?)[)\]]', report_name)
        if match:
            return match.group(1).strip()
        # Or just return the report name cleaned up
        name = report_name.replace('자산운용보고서', '').strip()
        return name if name else report_name


if __name__ == '__main__':
    scraper = FundScraper()

    # Step 1: search for semiconductor/memory fund reports filed from
    # 2024-01-01 onwards (the start date passed below).
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 1: Searching for semiconductor/memory fund reports")
    print(banner)
    filings = scraper.search_semiconductor_funds(start='2024-01-01')

    # Step 2: only attempt parsing when the search returned something.
    found_any = not filings.empty
    if found_any:
        print("\n\nSTEP 2: Parsing first 5 filings to extract holdings...")
        scraper.parse_all_unparsed(limit=5)