-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathname_basics_data.py
38 lines (32 loc) · 1.53 KB
/
name_basics_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Clean non-commercial data from names.basics.tsv from IMDb site and prepare for loading into a
database."""
from imdb_data import IMDbData
class NameBasicsData(IMDbData):
    """Clean and hold the IMDb name.basics.tsv data as database-ready tables."""

    # Layout of name.basics.tsv:
    #   nconst            - alphanumeric unique identifier of the name/person (string)
    #   primaryName       - name by which the person is most often credited (string)
    #   birthYear         - in YYYY format
    #   deathYear         - in YYYY format if applicable, else '\N'
    #   primaryProfession - the top-3 professions of the person (array of strings)
    #   knownForTitles    - titles the person is known for (array of tconst values)

    def __init__(self, init_df):
        """Build the ``name_basics`` and ``known_for_titles`` tables from *init_df*.

        ``init_df`` is handed to the ``IMDbData`` initializer together with the
        table name; presumably that is what populates
        ``self.data_frames["name_basics"]`` (confirm against ``IMDbData``).
        """
        table = "name_basics"
        super().__init__(init_df, table)

        # Columns that survive in the final name_basics table.
        self.desired_columns = ["primaryName", "birthYear", "deathYear"]

        # Index the table by person identifier; done in place on the stored frame.
        self.data_frames[table].set_index("nconst", inplace=True)

        # knownForTitles is a compound column: explode it into a table of its own.
        known_for = self.explode_columns(self.data_frames[table], "knownForTitles")
        self.data_frames["known_for_titles"] = known_for

        # primaryProfession is the other compound column: split it into
        # multiple columns rather than a separate table.
        with_professions = self.split_columns(
            self.data_frames[table], ["primaryProfession"]
        )
        self.data_frames[table] = with_professions

        # Keep only the desired columns, which removes knownForTitles.
        # NOTE(review): any column not listed in desired_columns is dropped here,
        # which may include the columns produced by the primaryProfession split --
        # confirm this is intended.
        trimmed = self.data_frames[table][self.desired_columns]
        self.data_frames[table] = trimmed