Skip to content

Commit d3f7796

Browse files
committed
add crawling
1 parent 200a343 commit d3f7796

File tree

7 files changed

+542
-2
lines changed

7 files changed

+542
-2
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ dependencies = [
1111
"django-celery-beat>=2.8.1",
1212
"django-celery-results>=2.6.0",
1313
"django-tailwind[reload]>=3.8.0",
14+
"feedparser>=6.0.11",
1415
"gunicorn>=23.0.0",
1516
"httpx>=0.28.1",
1617
"llm>=0.24.2",

pythonkr_backend/curation/admin.py

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from django.contrib import admin, messages
2-
from .models import Article, Category # Or combine imports
2+
from .models import Article, Category, RSSFeed, RSSItem # Or combine imports
33

44

55
@admin.register(Category)
@@ -86,3 +86,80 @@ def summary_ko_preview(self, obj):
8686
preview = obj.summary_ko[:50]
8787
return f"{preview}..." if len(obj.summary_ko) > 50 else preview
8888
return "No Korean summary"
89+
90+
91+
@admin.register(RSSFeed)
class RSSFeedAdmin(admin.ModelAdmin):
    """Admin configuration for RSS feeds.

    Shows each feed with the number of items collected for it and offers a
    bulk action that crawls the selected feeds from the changelist.
    """

    list_display = ('name', 'url', 'is_active', 'last_fetched', 'item_count', 'created_at')
    list_filter = ('is_active', 'created_at', 'last_fetched')
    search_fields = ('name', 'url')
    readonly_fields = ('last_fetched', 'created_at', 'updated_at')
    actions = ['crawl_selected_feeds']

    fieldsets = (
        ('Feed Information', {
            'fields': ('name', 'url', 'is_active')
        }),
        ('Status', {
            'fields': ('last_fetched', 'created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )

    def get_queryset(self, request):
        """Return feeds annotated with ``_item_count``.

        Counting in the main query avoids issuing one extra ``COUNT`` query
        per row when the changelist renders the ``item_count`` column.
        """
        # Imported locally so this class does not depend on a new
        # module-level import.
        from django.db.models import Count

        return super().get_queryset(request).annotate(_item_count=Count('items'))

    @admin.display(description='Items Count', ordering='_item_count')
    def item_count(self, obj):
        """Return the number of RSS items that belong to ``obj``."""
        annotated = getattr(obj, '_item_count', None)
        if annotated is not None:
            return annotated
        # Fallback for instances that did not come from get_queryset().
        return obj.items.count()

    @admin.action(description="Crawl selected RSS feeds")
    def crawl_selected_feeds(self, request, queryset):
        """Crawl every selected feed and report the outcome to the user.

        A feed is counted as successful only when the crawl call returns
        without raising. Failures are collected per feed and reported in a
        single warning message, so one broken feed does not stop the rest.
        """
        # Imported lazily, as in the original code.
        # NOTE(review): the crawl runs synchronously inside the admin
        # request; if crawl_single_rss_feed is a Celery task, confirm that
        # blocking the request is intended.
        from .tasks import crawl_single_rss_feed

        success_count = 0
        total_new_items = 0
        errors = []

        for feed in queryset:
            # Keep the try body to the crawl call only, so a problem with the
            # returned value can never mark a feed as both success and error.
            try:
                result = crawl_single_rss_feed(feed.id)
            except Exception as e:
                errors.append(f"{feed.name}: {e}")
                continue

            success_count += 1
            # Tolerate a crawl function that returns nothing (or a non-dict).
            if isinstance(result, dict):
                total_new_items += result.get('new_items', 0)

        if success_count > 0:
            self.message_user(
                request,
                f"Successfully crawled {success_count} feed(s). Found {total_new_items} new items.",
                messages.SUCCESS
            )

        if errors:
            error_message = "Errors encountered:\n" + "\n".join(errors)
            self.message_user(request, error_message, messages.WARNING)
140+
141+
142+
@admin.register(RSSItem)
class RSSItemAdmin(admin.ModelAdmin):
    """Admin configuration for individual RSS items."""

    # Changelist behaviour.
    date_hierarchy = 'pub_date'
    list_display = ('title', 'feed', 'author', 'pub_date', 'created_at')
    list_filter = ('feed', 'pub_date', 'created_at', 'author')
    search_fields = ('title', 'description', 'author', 'link')

    # Edit form layout; ``created_at`` is shown but cannot be edited.
    readonly_fields = ('created_at',)
    fieldsets = (
        (
            'Item Information',
            {'fields': ('feed', 'title', 'link', 'author', 'category')},
        ),
        (
            'Content',
            {
                'fields': ('description',),
                'classes': ('collapse',),
            },
        ),
        (
            'Metadata',
            {
                'fields': ('guid', 'pub_date', 'created_at'),
                'classes': ('collapse',),
            },
        ),
    )

    def get_queryset(self, request):
        """Return the admin queryset with each item's ``feed`` joined in."""
        items = super().get_queryset(request)
        return items.select_related('feed')
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# Generated by Django 5.2.1 on 2025-06-11 12:16
2+
3+
import datetime
4+
import django.db.models.deletion
5+
from django.db import migrations, models
6+
7+
8+
class Migration(migrations.Migration):
    """Create the crawling and RSS tables for the ``curation`` app.

    Adds six models: CrawlingSources, RSSFeed, CrawlingSite, CrawlURL,
    CrawledContent and RSSItem. Models that are the target of a foreign key
    are created before the models that reference them.
    """

    # Must run after the migration that added article categories.
    dependencies = [
        ('curation', '0006_article_categories'),
    ]

    operations = [
        # A named crawling source with an optional RSS feed URL and a fetch
        # interval that defaults to 3600 seconds (one hour).
        migrations.CreateModel(
            name='CrawlingSources',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256, verbose_name='Crawling Source Name')),
                ('rss_feed_url', models.URLField(blank=True, null=True, verbose_name='RSS Feed URL')),
                ('fetch_interval', models.DurationField(default=datetime.timedelta(seconds=3600), help_text='크롤링 주기 (시간 단위)')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
        ),
        # An RSS feed identified by a unique URL; new feeds are active by
        # default and ``last_fetched`` starts out empty.
        migrations.CreateModel(
            name='RSSFeed',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(help_text='RSS 피드 이름', max_length=200)),
                ('url', models.URLField(help_text='RSS 피드 URL', unique=True)),
                ('is_active', models.BooleanField(default=True, help_text='활성화 여부')),
                ('last_fetched', models.DateTimeField(blank=True, help_text='마지막 크롤링 시간', null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
            options={
                'verbose_name': 'RSS Feed',
                'verbose_name_plural': 'RSS Feeds',
            },
        ),
        # A site that belongs to a crawling source, together with its license
        # type and copyright-notice settings. Deleting the source cascades to
        # its sites.
        migrations.CreateModel(
            name='CrawlingSite',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256, verbose_name='Site Name')),
                # NOTE(review): db_index=True looks redundant next to
                # unique=True, which already creates an index.
                ('url', models.URLField(db_index=True, help_text='사이트의 기본 URL', unique=True)),
                ('license_type', models.CharField(choices=[('MIT', 'MIT License'), ('BSD-2-Clause', 'BSD 2-Clause License'), ('Apache-2.0', 'Apache License 2.0'), ('GPL-3.0', 'GNU GPL v3'), ('LGPL-3.0', 'GNU LGPL v3'), ('MPL-2.0', 'Mozilla Public License 2.0'), ('CC0-1.0', 'CC0 1.0 Universal'), ('CC-BY-4.0', 'Creative Commons Attribution 4.0'), ('CC-BY-SA-4.0', 'Creative Commons Attribution-ShareAlike 4.0'), ('PROPRIETARY', 'Proprietary License')], default='MIT', help_text='Select the license type for the crawling source.', max_length=20)),
                ('copyright_notice_required', models.BooleanField(default=False, help_text='저작권 고지가 필요한지 여부')),
                ('copyright_link', models.URLField(blank=True, help_text='저작권 고지 링크 (필요한 경우)', null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('source', models.ForeignKey(help_text='이 사이트가 속한 크롤링 소스', on_delete=django.db.models.deletion.CASCADE, related_name='sites', to='curation.crawlingsources')),
            ],
        ),
        # A unique article URL queued for crawling on a site; ``status`` is
        # one of pending / success / failed and defaults to pending.
        migrations.CreateModel(
            name='CrawlURL',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('url', models.URLField(unique=True, verbose_name='Crawling Article URL')),
                ('status', models.CharField(choices=[('pending', '대기 중'), ('success', '성공'), ('failed', '실패')], default='pending', help_text='크롤링 상태', max_length=10)),
                # NOTE(review): both crawl_creation_date and created_at use
                # auto_now_add, so they appear to record the same moment.
                ('crawl_creation_date', models.DateTimeField(auto_now_add=True, help_text='크롤링이 생성된 시각(자동 기록)')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('site', models.ForeignKey(help_text='이 URL이 속한 사이트', on_delete=django.db.models.deletion.CASCADE, related_name='crawl_urls', to='curation.crawlingsite')),
            ],
        ),
        # Content stored for a CrawlURL; ``is_visible`` defaults to False.
        migrations.CreateModel(
            name='CrawledContent',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(blank=True, help_text='기사 제목', max_length=512)),
                ('content', models.TextField(help_text='크롤링 본문')),
                ('published_at', models.DateTimeField(blank=True, help_text='원문 게시 시각', null=True)),
                ('is_visible', models.BooleanField(default=False, help_text='라이선스/저작권 검증 결과 콘텐츠 노출 여부')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('crawl_url', models.ForeignKey(help_text='원본 URL 객체', on_delete=django.db.models.deletion.CASCADE, related_name='contents', to='curation.crawlurl')),
            ],
        ),
        # A single entry of an RSSFeed, reachable from the feed through the
        # ``items`` related name and ordered newest first.
        migrations.CreateModel(
            name='RSSItem',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(help_text='제목', max_length=500)),
                ('link', models.URLField(help_text='아이템 URL', unique=True)),
                ('description', models.TextField(blank=True, help_text='설명')),
                ('author', models.CharField(blank=True, help_text='작성자', max_length=200)),
                ('category', models.CharField(blank=True, help_text='카테고리', max_length=200)),
                # NOTE(review): blank=True together with unique=True means a
                # second item saved with an empty GUID would violate the
                # unique constraint; confirm the crawler always sets a GUID.
                ('guid', models.CharField(blank=True, help_text='GUID', max_length=500, unique=True)),
                ('pub_date', models.DateTimeField(blank=True, help_text='발행일', null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('feed', models.ForeignKey(help_text='이 아이템이 속한 RSS 피드', on_delete=django.db.models.deletion.CASCADE, related_name='items', to='curation.rssfeed')),
            ],
            options={
                'verbose_name': 'RSS Item',
                'verbose_name_plural': 'RSS Items',
                'ordering': ['-pub_date', '-created_at'],
            },
        ),
    ]

0 commit comments

Comments
 (0)