class Vanuatu(CountryTestBase):
    """Scrape the cumulative testing figure for Vanuatu.

    The surveillance page at ``source_url`` is searched for a link whose text
    matches ``regex["title"]``; the linked document is read as a PDF with
    tabula, and the date and cumulative count are taken from two fixed
    regions of its first page.
    """

    location: str = "Vanuatu"
    units: str = "people tested"
    source_label: str = "Ministry of Health"
    source_url: str = "https://covid19.gov.vu/index.php/surveillance"
    _base_url: str = "https://covid19.gov.vu"
    # Set by _parse_data to the URL of the report that was actually parsed.
    source_url_ref: str = None
    regex: dict = {
        "title": r"Surveillance Report for Epi Week",
        # The capture group is the second date of the "start - end" range.
        "date": r"\d{1,2}\/\d{2}\/20\d{2} - (\d{1,2}\/\d{2}\/20\d{2})",
    }

    @property
    def area(self) -> list:
        """
        Areas of pdf to be extracted

        Returns:
            list: [[y1, x1, y2, x2], ...]

        For more info see:
        https://github.com/tabulapdf/tabula-java/wiki/Using-the-command-line-tabula-extractor-tool
        """
        return [[122, 188, 140, 419], [437, 7, 484, 299]]

    def read(self) -> pd.DataFrame:
        """Read data from source.

        Returns:
            pd.DataFrame: Single-row frame built by ``_parse_data``.
        """
        soup = get_soup(self.source_url)
        df = self._parse_data(soup)
        return df

    def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
        """Parse data from soup.

        Stores the resolved report URL in ``self.source_url_ref`` as a side
        effect.

        Args:
            soup: Parsed HTML of the surveillance page.

        Returns:
            pd.DataFrame: One row with the columns ``Date`` and
            ``Cumulative total``.

        Raises:
            ValueError: If no matching link (or no usable ``href``) is found,
                or if the PDF does not yield the expected tables.
        """
        # Get the article URL. ``find`` returns None when nothing matches, so
        # the anchor must be checked before its ``href`` is read; otherwise a
        # missing article surfaces as a TypeError instead of the ValueError
        # below.
        anchor = soup.find("a", text=re.compile(self.regex["title"]))
        link = anchor.get("href") if anchor is not None else None
        if not link:
            raise ValueError("Article not found, please update the script")
        self.source_url_ref = f"{self._base_url}{link}"
        # Parse pdf tables from link
        tables = self._parse_pdf_tables()
        # Get the metrics
        count = self._parse_metrics(tables)
        # Get the date
        date = self._parse_date(tables)
        df = pd.DataFrame(
            {
                "Date": [date],
                "Cumulative total": [count],
            }
        )
        return df

    def _parse_pdf_tables(self) -> list:
        """Parse pdf tables from link.

        Reads page 1 of ``self.source_url_ref`` in stream mode, restricted to
        the regions given by ``self.area``.

        Returns:
            list: The two extracted tables, in the order of ``self.area``.

        Raises:
            ValueError: If tabula does not return exactly two tables.
        """
        tables = tabula.read_pdf(self.source_url_ref, pages="1", stream=True, area=self.area)
        if len(tables) != 2:
            raise ValueError("PDF structure has changed, please update the script")
        return tables

    def _parse_metrics(self, tables: list) -> int:
        """Parse metrics from the list of tables.

        Args:
            tables: Tables returned by ``_parse_pdf_tables``.

        Returns:
            int: The value at row 0 of the ``Cumulative`` column of the second
            table, passed through ``clean_count``.
        """
        count = tables[1].loc[0, "Cumulative"]
        return clean_count(count)

    def _parse_date(self, tables: list) -> str:
        """Parse date from the list of tables.

        Args:
            tables: Tables returned by ``_parse_pdf_tables``.

        Returns:
            str: Date extracted from the first column header of the first
            table using ``regex["date"]`` and the ``%d/%m/%Y`` format.
        """
        date_str = tables[0].columns[0]
        return extract_clean_date(date_str, self.regex["date"], "%d/%m/%Y")

    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pipeline for data processing.

        Args:
            df: Frame produced by ``read``.

        Returns:
            pd.DataFrame: ``df`` after ``self.pipe_metadata`` has been applied.
        """
        # pipe_metadata is not defined in this class; presumably inherited
        # from CountryTestBase.
        return df.pipe(self.pipe_metadata)

    def export(self):
        """Export data to csv."""
        df = self.read().pipe(self.pipeline)
        # export_datafile is not defined in this class; presumably inherited
        # from CountryTestBase.
        self.export_datafile(df, attach=True)