class Fiji:
    """Scraper for the cumulative COVID-19 test count published for Fiji.

    The paginated index at ``source_url`` is scanned for ``h2`` headings whose
    text contains ``regex["title"]``. Each matching heading links to an article;
    the article text is searched for the reporting date and the cumulative
    number of tests.
    """

    location = "Fiji"
    units = "tests performed"
    source_label = "Fiji Ministry of Health & Medical Services"
    notes = ""
    # Base of the paginated index; the page number and a trailing "/" are appended.
    source_url = "https://www.health.gov.fj/page/"
    # Number of index pages scanned before giving up.
    _num_max_pages = 3
    # Maximum number of matching headings inspected on each index page.
    _num_rows_per_page = 3
    # Headings of the page currently being parsed that are still to be
    # inspected (name-mangled to ``_Fiji__element``).
    __element = None
    regex = {
        # Used as a plain substring test against the heading text, not as a regex.
        "title": r"COVID-19 Update",
        "year": r"\d{4}",
        # Article text is lower-cased before matching, hence lower-case patterns.
        "date": r"tests have been reported for (\w+ \d+)",
        "count": r"tests since 2020 are (\d+,\d+)",
    }

    def read(self) -> pd.Series:
        """Read data from source.

        Returns:
            A ``pd.Series`` with keys ``source_url``, ``date`` and ``count`` for
            the first usable article found, or ``None`` when none of the first
            ``_num_max_pages`` index pages yields one.
        """
        for page in range(1, self._num_max_pages + 1):
            soup = get_soup(f"{self.source_url}{page}/")
            record, proceed = self._parse_data(soup)
            if not proceed:
                return pd.Series(record)
        return None

    def _parse_data(self, soup: "BeautifulSoup") -> tuple:
        """Get data from one index page.

        The heading list is built once per page and then consumed heading by
        heading, so that every attempt looks at a different article. At most
        ``_num_rows_per_page`` headings are inspected.

        Args:
            soup: Parsed index page.

        Returns:
            ``(record, False)`` as soon as an article provides a date, where
            ``record`` is a dict with ``source_url``, ``date`` and ``count``;
            ``(None, True)`` when the caller should proceed to the next page.

        Raises:
            TypeError: Propagated from ``_parse_metrics`` when an article has a
                date but no recognisable test count.
        """
        # Get relevant element list
        self._get_list_of_elements(soup)
        for _ in range(self._num_rows_per_page):
            if not self.__element:
                break
            # Get relevant element and year from element list
            elem, year = self._get_relevant_element_and_year()
            # Extract url from element
            url = self._parse_link_from_element(elem)
            # Extract text from url
            text = self._get_text_from_url(url)
            # Extract date from text; articles without one are skipped
            date = self._parse_date_from_text(year, text)
            if not date:
                continue
            # Extract metrics from text
            count = self._parse_metrics(text)
            record = {
                "source_url": url,
                "date": date,
                "count": count,
            }
            return record, False
        return None, True

    def _get_list_of_elements(self, soup: "BeautifulSoup") -> None:
        """Store the ``h2`` headings of the page whose text contains the title marker."""
        elem_list = soup.find_all("h2")
        self.__element = [title for title in elem_list if self.regex["title"] in title.text]

    def _get_relevant_element_and_year(self) -> tuple:
        """Pop the next pending heading and return it with the year found in its text.

        Raises:
            AttributeError: If the heading text contains no four-digit number
                (``re.search`` returns ``None``).
        """
        elem = self.__element.pop(0)
        year = re.search(self.regex["year"], elem.text).group()
        return elem, year

    def _parse_date_from_text(self, year: str, text: str) -> str:
        """Get the reporting date from the article text.

        Args:
            year: Four-digit year taken from the heading.
            text: Lower-cased article text.

        Returns:
            Whatever ``clean_date`` returns for ``"<month> <day> <year>"``, or
            ``None`` when the text has no date sentence.
        """
        match = re.search(self.regex["date"], text)
        if not match:
            return None
        month_day = match.group(1)
        # The separating space is required for the string to match "%B %d %Y".
        # NOTE(review): clean_date is a project helper, presumably strptime-based.
        return clean_date(f"{month_day} {year}", "%B %d %Y")

    def _parse_link_from_element(self, elem: "element.Tag") -> str:
        """Get the ``href`` of the first anchor inside the heading."""
        link = elem.find("a")["href"]
        return link

    def _get_text_from_url(self, url: str) -> str:
        """Fetch the article and return its text, flattened and lower-cased.

        Newlines become spaces and non-breaking spaces (``\\xa0``) are removed.
        """
        soup = get_soup(url)
        text = soup.get_text().replace("\n", " ").replace("\xa0", "").lower()
        return text

    def _parse_metrics(self, text: str) -> int:
        """Get the cumulative test count from the article text.

        Raises:
            TypeError: If the count sentence is not found in the text.
        """
        match = re.search(self.regex["count"], text)
        if not match:
            raise TypeError(("Website Structure Changed, please update the script"))
        count = match.group(1)
        return clean_count(count)

    def export(self):
        """Export data to csv.

        Raises:
            TypeError: If ``read`` found no usable article.
        """
        data = self.read()
        if data is None:
            # Same exception type that subscripting ``None`` raised before, but
            # with a message that says what actually went wrong.
            raise TypeError("No data found in the scanned pages, please update the script")
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            count=data["count"],
        )