Add extra values that should be read as NA, and handle counts with spaces in them

cchuong · cchuong · commit 9d2291140d47 · 2025-04-25T10:33:16.000-07:00
diff --git a/src/acquisition/rvdss/pull_historic.py b/src/acquisition/rvdss/pull_historic.py
@@ -237,6 +237,7 @@ def create_detections_table(table,modified_date,week_number,week_end_date,start_
 
 def create_number_detections_table(table,modified_date,start_year):
     week_columns = table.columns.get_indexer(table.columns[~table.columns.str.contains('week')])
+    table = table.apply(lambda x: x.replace(r'\s', '', regex=True).astype('int'))
 
     for index in week_columns:
         new_name = abbreviate_virus(table.columns[index]) + " positive_tests"
diff --git a/tests/acquisition/rvdss/test_pull_historic.py b/tests/acquisition/rvdss/test_pull_historic.py
@@ -226,7 +226,8 @@ def test_create_detections_table(self):
         tab.tfoot.decompose()
         tab = re.sub(",",r".",str(tab))
             
-        na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
+        na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available",
+                     "not tested","N.D.","-",'Not tested','non testé']
         table =  pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
         table.columns=table.columns.str.lower()
         table = drop_ah1_columns(table)
@@ -260,7 +261,8 @@ def test_create_number_detections_table(self):
         tab = caption.find_next('table')
         tab = re.sub(",","",str(tab))
                 
-        na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
+        na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available",
+                     "not tested","N.D.","-",'Not tested','non testé']
         table =  pd.read_html(tab,na_values=na_values)[0].dropna(how="all")
         table.columns=table.columns.str.lower()
         table = drop_ah1_columns(table)
@@ -295,8 +297,8 @@ def test_create_percent_positive_detection_table(self):
         expected_rsvdata = expected_rsvdata.sort_values(by=['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
         
         # get tables from raw html and process before testing the function
-        na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available","not tested","N.D.","-"]
-        
+        na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"not available",
+                     "not tested","N.D.","-",'Not tested','non testé']
         flu_caption=[t for t in captions if "Influenza" in t.text][0]
         flu_tab = flu_caption.find_next('table')
         flu_tab = re.sub(",","",str(flu_tab))