class Vietnam:
    """Collect the latest "people tested" figure for Vietnam.

    The bulletin index at ``source_url`` is scanned for the first link whose
    ``title`` matches ``regex["title"]``; the linked article is then parsed
    for its publication date and the test count.
    """

    location = "Vietnam"
    units = "people tested"
    source_label = "Ministry of Health of Vietnam"
    # base_url = "https://suckhoedoisong.vn"
    base_url = "https://covid19.gov.vn"
    source_url = "https://covid19.gov.vn/ban-tin-covid-19.htm"
    # "title": matched against the `title` attribute of index links ("Ngày" = "day").
    # "date":  not referenced by any method of this class (the article date is
    #          parsed with a slash-separated pattern in `_parse_date_from_text`).
    # "count": digits following "mẫu tương đương" ("samples, equivalent to").
    regex = {
        "title": r"Ngày",
        "date": r"(\d{2}\-\d{2}\-\d{4})",
        "count": r"mẫu tương đương (\d+)",
    }

    def read(self) -> dict:
        """Read data from source.

        Returns:
            dict: Record with keys ``source_url``, ``date`` and ``count``.
        """
        soup = get_soup(self.source_url)
        return self._parse_data(soup)

    def _parse_data(self, soup: "BeautifulSoup") -> dict:
        """Get data from the source page.

        Args:
            soup: Parsed bulletin index page.

        Returns:
            dict: Record with keys ``source_url``, ``date`` and ``count``.

        Raises:
            ValueError: If the link, the date element or the count cannot be found.
        """
        # Get relevant link
        url = self._get_relevant_link(soup)
        # Download the article once and reuse the parsed page for both the
        # text and the date, so both values come from the same response.
        article = get_soup(url)
        text = self._get_text_from_soup(article)
        date = self._parse_date_from_text(article)
        # Extract metrics from text
        count = self._parse_metrics(text)
        return {
            "source_url": url,
            "date": date,
            "count": count,
        }

    def _get_relevant_link(self, soup: "BeautifulSoup") -> str:
        """Get the relevant URL from the source page.

        Args:
            soup: Parsed bulletin index page.

        Returns:
            str: Absolute URL of the first matching article link.

        Raises:
            ValueError: If no anchor with an ``href`` and a matching title exists.
        """
        from urllib.parse import urljoin

        # href=True skips anchors without a target, which would otherwise
        # fail with a KeyError on the lookup below.
        elem_list = soup.find_all("a", href=True, title=re.compile(self.regex["title"]))
        if not elem_list:
            raise ValueError("No relevant links found, please update the regex")
        href = elem_list[0]["href"]
        # urljoin handles root-relative, relative and already-absolute hrefs;
        # plain concatenation only worked for hrefs starting with "/".
        return urljoin(self.base_url, href)

    def _get_text_from_url(self, url: str) -> str:
        """Extract text from URL.

        Args:
            url: Address of the page to download.

        Returns:
            str: Normalised page text (see `_get_text_from_soup`).
        """
        return self._get_text_from_soup(get_soup(url))

    def _get_text_from_soup(self, soup: "BeautifulSoup") -> str:
        """Return the page text with digit-separating dots and extra whitespace removed."""
        text = soup.get_text()
        # Drop dots sitting between digits (e.g. "1.234" -> "1234") so that the
        # count regex can capture the number as a single run of digits.
        text = re.sub(r"(\d)\.(\d)", r"\1\2", text)
        # Collapse every whitespace run (including newlines) to a single space.
        text = re.sub(r"\s+", " ", text)
        return text

    def _parse_date_from_text(self, soup) -> str:
        """Get date from text.

        Args:
            soup: Parsed article page.

        Returns:
            str: Whatever ``extract_clean_date`` returns for the first
            ``dd/mm/yyyy`` match in the ``.detail-time div`` element.

        Raises:
            ValueError: If the page has no ``.detail-time div`` element.
        """
        nodes = soup.select(".detail-time div")
        if not nodes:
            raise ValueError("No date element found, please update the CSS selector")
        date_raw = nodes[0].text
        return extract_clean_date(date_raw, r"(\d{2}\/\d{2}\/\d{4})", "%d/%m/%Y")

    def _parse_metrics(self, text: str) -> int:
        """Get metrics from text.

        Args:
            text: Normalised article text.

        Returns:
            int: The captured count after ``clean_count`` (presumably an
            integer conversion - confirm against that helper).

        Raises:
            ValueError: If ``regex["count"]`` does not match ``text``.
        """
        match = re.search(self.regex["count"], text)
        if match is None:
            raise ValueError("No count found in text, please update the regex")
        return clean_count(match.group(1))

    def export(self):
        """Export data to CSV.

        Reads the latest record and passes it, together with the class
        metadata, to ``increment`` (which performs the actual write).
        """
        data = self.read()
        increment(
            sheet_name=self.location,
            country=self.location,
            units=self.units,
            date=data["date"],
            source_url=data["source_url"],
            source_label=self.source_label,
            count=data["count"],
        )