-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_articles.py
69 lines (52 loc) · 2.15 KB
/
import_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
import csv
import urllib2
import time
import datetime
from django.core.management.base import BaseCommand
from epu_index.models import Article, NewsJournal
# Field separator handed to csv.reader: the import file is tab-separated.
DELIMITER = '\t'
# Quote character handed to csv.reader: fields may be wrapped in single quotes.
QUOTECHAR = "'"
# Key: journal identifier as it appears in the CSV file.
# Value: spider_name of the corresponding journal in the database.
# Built once at import time instead of on every call, since the lookup
# runs once per imported row.
_JOURNAL_SPIDER_NAMES = {
    'DeStandaard': 'standaard',
    'DeTijd': 'detijd',
    'DeRedactie': 'deredactie',
    'DeMorgen': 'demorgen',
    'Nieuwsblad': 'nieuwsblad',
    'HLN': 'hln',
}


def translate_journal(name):
    """Return the database spider_name for a CSV journal identifier.

    Args:
        name: Journal identifier as written in the CSV file, for example
            'DeStandaard'. The lookup is case-sensitive.

    Returns:
        The matching spider_name string.

    Raises:
        KeyError: If ``name`` is not one of the known identifiers. The
            message lists the accepted identifiers.
    """
    try:
        return _JOURNAL_SPIDER_NAMES[name]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message
        # that tells the operator which identifiers are accepted.
        raise KeyError('Unknown journal identifier %r; expected one of: %s'
                       % (name, ', '.join(sorted(_JOURNAL_SPIDER_NAMES))))
class Command(BaseCommand):
    """Management command that imports articles from a remote CSV file.

    The file is fetched from the ``csv_url`` argument and parsed with the
    module-level DELIMITER and QUOTECHAR. Every non-empty row must hold
    exactly five fields, in this order: publication date, journal
    identifier, title, cleaned text and EPU score.
    """

    # Built with implicit string concatenation: a backslash continuation
    # inside the literal either glues the words together or embeds the
    # indentation of the following line into the help text.
    help = ('Load CSV file to populate Article model. Journals should be '
            'loaded before so foreign keys are set appropriately')

    def add_arguments(self, parser):
        """Register the ``csv_url`` positional and the ``--truncate`` flag."""
        # Positional arguments
        parser.add_argument('csv_url')
        # Optional arguments
        parser.add_argument('--truncate',
                            action='store_true',
                            dest='truncate',
                            default=False,
                            help='Truncate Article table before import.')

    def handle(self, *args, **options):
        """Download the CSV file and create one Article per row.

        Args:
            *args: Unused.
            **options: Parsed command options; reads ``csv_url`` (URL of
                the CSV file) and ``truncate`` (delete all existing
                Article rows before importing).

        Raises:
            ValueError: If a non-empty row does not have exactly five
                fields, or its date or EPU score cannot be parsed.
            KeyError: If a row names a journal that translate_journal
                does not know.
            NewsJournal.DoesNotExist: If no NewsJournal row exists for
                the translated spider_name.
        """
        # The URL is opened before truncating, so a failed download does
        # not wipe the existing data.
        csvfile = urllib2.urlopen(options['csv_url'])
        try:
            reader = csv.reader(csvfile,
                                delimiter=DELIMITER,
                                quotechar=QUOTECHAR)

            if options['truncate']:
                self.stdout.write('Truncating Article data table...')
                Article.objects.all().delete()

            # Only a handful of journals exist, so each one is fetched
            # once instead of issuing a query for every row.
            journals_by_spider = {}

            for row in reader:
                if not row:
                    # csv.reader yields an empty list for blank lines
                    # (e.g. a trailing newline); nothing to import.
                    continue

                published_at, journal_name, title, cleaned_text, epu = row

                dt = datetime.datetime.strptime(published_at,
                                                '%a %b %d %H %M %S %Y')
                dt_str = dt.isoformat() + '+00'  # assume all datetimes are in UTC

                spider_name = translate_journal(journal_name)
                if spider_name not in journals_by_spider:
                    journals_by_spider[spider_name] = NewsJournal.objects.get(
                        spider_name=spider_name)

                # The CSV only carries the cleaned text, so it is stored
                # in both the text and cleaned_text fields.
                Article.objects.create(
                    published_at=dt_str,
                    news_journal=journals_by_spider[spider_name],
                    text=cleaned_text,
                    cleaned_text=cleaned_text,
                    title=title,
                    epu_score=float(epu))

                # Progress indicator: one dot per imported article.
                self.stdout.write('.', ending="")
                self.stdout.flush()
        finally:
            # Release the HTTP connection even if a row fails to import.
            csvfile.close()