forked from weecology/sad-comparison
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconnolly_data_extraction.py
38 lines (32 loc) · 1.76 KB
/
connolly_data_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Export data in proper format for Connolly"""
import os
import pandas as pd
def import_data(datasets, datadir):
    """Import per-dataset species-abundance files from *datadir*.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names; each maps to a file '<name>_spab.csv' in *datadir*.
    datadir : str
        Directory containing the '*_spab.csv' files.

    Returns
    -------
    pandas.DataFrame
        Columns ['dataset', 'site_ID', 'species', 'abundance'] with
        zero-abundance records removed; empty frame if *datasets* is empty.
    """
    frames = []
    for dataset in datasets:
        # Python 3 print() — the original used a Python 2 print statement,
        # which is a SyntaxError under Python 3.
        print("Importing {} data".format(dataset))
        datafile = os.path.join(datadir, dataset + '_spab.csv')
        new_data = pd.read_csv(datafile, comment='#',
                               usecols=['site_ID', 'species', 'abundance'])
        # Drop records where no individuals were observed.
        new_data = new_data[new_data['abundance'] > 0]
        # Tag every row with its source dataset as the first column.
        new_data.insert(0, 'dataset', dataset)
        frames.append(new_data)
    # DataFrame.append was removed in pandas 2.0; a single concat of all
    # frames is also O(n) instead of O(n^2) repeated copying.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
def filter_data_minS(data, minS):
    """Drop any (dataset, site_ID) group whose species richness is below *minS*.

    Each row is one species record, so a group's length is its species
    count S; only groups with S >= minS survive.
    """
    grouped = data.groupby(['dataset', 'site_ID'])
    return grouped.filter(lambda site: site.shape[0] >= minS)
datasets = ['Actinopterygii_morphos', 'Amphibia', 'Arachnida_morphos', 'bbs', 'cbc', 'Coleoptera',
            'fia', 'gentry', 'mcdb', 'naba', 'Reptilia_morphos']
data = import_data(datasets, './sad-data/chapter3/')
data = filter_data_minS(data, minS=5)
# Group on the scalar column name, not a one-element list: with a list,
# modern pandas yields 1-tuple group keys, so `dataset` would format as
# "('bbs',)" in the messages and output filenames below.
data_by_dataset = data.groupby('dataset')
for dataset, dataset_data in data_by_dataset:
    # Exact duplicate rows (same site, species, AND abundance) can be
    # dropped safely; report how many were found.
    dataset_dedupped = dataset_data.drop_duplicates()
    if len(dataset_data) != len(dataset_dedupped):
        print("{} had {} duplicate site-species-abundance combinations out of {} records".format(dataset, len(dataset_data) - len(dataset_dedupped), len(dataset_data)))
        dataset_data = dataset_dedupped
    try:
        # Wide site-by-species abundance matrix for export.
        pivoted_data = dataset_data.pivot(index='species', columns='site_ID', values='abundance')
        pivoted_data.to_csv('./sad-data/chapter3/connolly_data/{}_pivoted_spab.csv'.format(dataset), float_format='%.0f')
    except ValueError:
        # pivot raises when a (species, site_ID) pair appears with two
        # different abundances — those duplicates can't be resolved here.
        print("{} had duplicate site-species combinations".format(dataset))