class Syria(CountryTestBase):
    """Scraper for the Syria testing figures published as PDF reports.

    The flow implemented below is: load the listing page at ``source_url``,
    take the first ``h4.title`` link, follow it to find the downloadable PDF,
    extract the PDF text and pull the test count out with ``regex["count"]``.
    """

    location: str = "Syria"
    units: str = "tests performed"
    source_label: str = "WHO Syrian Arab Republic"
    source_url: str = (
        "https://reliefweb.int/updates?advanced-search=%28C226%29_%28S1275%29_%28DT4642%29_%28T4595%29_%28F10%29"
    )
    # "date" is matched against the link text of the listing entry;
    # "count" is matched against the PDF text and captures two digit groups
    # that are concatenated into a single number (see `_parse_metrics`).
    regex: dict = {
        "date": r"(\d{1,2} \w+ 20\d{2})",
        "count": r"Total Test (\d+) (\d+)",
    }

    def read(self) -> pd.Series:
        """Read data from source.

        Returns:
            A Series wrapping the tuple returned by `_parse_data`; position 0
            holds the record dict (``source_url``, ``date``, ``count``).
        """
        soup = get_soup(self.source_url)
        data = self._parse_data(soup)
        return pd.Series(data)

    def _parse_data(self, soup: BeautifulSoup) -> tuple:
        """Parse the latest record from the listing page soup.

        Returns:
            ``(record, False)`` on success, or ``(None, True)`` when no
            relevant element is available.
            NOTE(review): the boolean presumably flags "nothing found" for the
            caller - confirm against the base class conventions.
        """
        # Get relevant element
        elem = self._get_relevant_element(soup)
        if not elem:
            return None, True
        # Extract url and date from element
        pdf_url, date = self._get_link_and_date_from_element(elem)
        # Extract text
        text = self._parse_pdf_url(pdf_url)
        # Get metrics from text
        count = self._parse_metrics(text)
        record = {
            "source_url": pdf_url,
            "date": date,
            "count": count,
        }
        return record, False

    def _get_relevant_element(self, soup: BeautifulSoup) -> element.Tag:
        """Return the link of the first ``h4.title`` entry in the listing.

        Raises:
            ValueError: if the page has no ``h4.title`` entry, or the first
                entry contains no link.
        """
        titles = soup.find_all("h4", "title")
        # Guard the empty list explicitly: indexing [0] on it would raise an
        # IndexError before the intended ValueError could be reported.
        if not titles:
            raise ValueError("Element not found, please check the source")
        elem = titles[0].find("a")
        if not elem:
            raise ValueError("Element not found, please check the source")
        return elem

    def _get_link_and_date_from_element(self, elem: element.Tag) -> tuple:
        """Extract link and date from relevant element.

        Returns:
            ``(pdf_url, date)``.
        """
        pdf_url = self._parse_link_from_element(elem)
        date = self._parse_date_from_element(elem)
        return pdf_url, date

    def _parse_link_from_element(self, elem: element.Tag) -> str:
        """Get the PDF link by following the element's ``href``.

        The linked page is fetched and the first ``<a>`` carrying a
        ``download`` attribute provides the file URL.

        Raises:
            ValueError: if the linked page has no download link.
        """
        download_anchor = get_soup(elem["href"]).find("a", download=True)
        if download_anchor is None:
            raise ValueError("Download link not found, please check the source")
        return download_anchor["href"]

    def _parse_date_from_element(self, elem: element.Tag) -> str:
        """Get date from the relevant element's text (format ``%d %B %Y``)."""
        return extract_clean_date(elem.text, regex=self.regex["date"], date_format="%d %B %Y")

    def _parse_pdf_url(self, pdf_url: str) -> str:
        """Download the PDF and return its text with newlines removed."""
        with tempfile.NamedTemporaryFile() as tmp:
            download_file_from_url(pdf_url, tmp.name)
            # NOTE(review): the file is re-opened by name while the temporary
            # file is still open; this is platform dependent - confirm the
            # supported platforms.
            with open(tmp.name, "rb") as f:
                text = extract_text(f)
        # Newlines are dropped (not replaced by spaces) before regex matching.
        text = text.replace("\n", "")
        return text

    def _parse_metrics(self, text: str) -> int:
        """Get the test count from report text.

        The two digit groups captured by ``regex["count"]`` are concatenated
        (e.g. ``"12"`` and ``"345"`` become ``12345``).

        Raises:
            ValueError: if the pattern is not found in ``text``.
        """
        match = re.search(self.regex["count"], text)
        if match is None:
            raise ValueError("Test count not found in report text, please check the source")
        return clean_count(int(match.group(1) + match.group(2)))

    def export(self):
        """Export data to csv."""
        # Position 0 of the Series is the record dict built by `_parse_data`.
        data = self.read()[0]
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            count=data["count"],
        )