class Russia:
    """Parsing helpers for the testing figures published in the Rospotrebnadzor news feed.

    The class locates the bulletin link on a news-listing page, downloads the
    bulletin, and extracts the publication date and the number of tests
    performed from its text.

    NOTE(review): ``_url_subdirectory`` and ``_num_max_pages`` are not used by
    any method visible here; presumably a pagination loop elsewhere builds the
    listing URLs from them -- confirm against the caller.
    """

    location = "Russia"
    units = "tests performed"
    source_label = "Government of the Russian Federation"
    notes = ""
    _base_url = "https://rospotrebnadzor.ru"
    _url_subdirectory = "/about/info/news/?PAGEN_1="
    _num_max_pages = 3
    # Patterns applied to the scraped text:
    #   title -- identifies the bulletin link in the news listing
    #   date  -- day, month, year separated by spaces (dots are replaced first)
    #   count -- captures the digits that follow "проведено"
    regex = {
        "title": r"Информационный бюллетень о ситуации",
        "date": r"(\d+ \d+ \d+)",
        "count": r"проведено (\d+).* исследовани",
    }

    def _parse_data(self, soup: BeautifulSoup) -> tuple:
        """Get data from the source page.

        Args:
            soup: Parsed news-listing page.

        Returns:
            A ``(record, flag)`` tuple. When no bulletin link is found the
            result is ``(None, True)``; otherwise ``record`` is a dict with the
            keys ``source_url``, ``date`` and ``count`` and ``flag`` is
            ``False``. The flag presumably tells the caller whether to keep
            looking on the next page -- confirm against the caller.
        """
        # Get relevant element
        elem = self._get_relevant_element(soup)
        if elem is None:
            return None, True
        # Extract url from element
        url = self._get_link_from_element(elem)
        # Extract text from url
        text, date = self._get_text_and_date_from_url(url)
        count = self._parse_metrics(text)
        record = {
            "source_url": url,
            "date": date,
            "count": count,
        }
        return record, False

    def _get_relevant_element(self, soup: BeautifulSoup) -> element.Tag:
        """Get the relevant element in the news feed.

        Scans every ``<a>`` found under the ``content`` -> ``page`` containers
        and returns the first one whose text matches ``regex["title"]``.

        Returns:
            The first matching anchor, or ``None`` when no anchor matches.

        Raises:
            AttributeError: If the ``content`` or ``page`` container is missing
                (``find`` returns ``None``).
        """
        news_list = soup.find(class_="content").find(class_="page").find_all("a")
        # The title entry lives in the `regex` mapping, so it is applied as a
        # regular expression like the other entries rather than as a substring.
        return next(
            (news for news in news_list if re.search(self.regex["title"], news.text)),
            None,
        )

    def _get_text_and_date_from_url(self, url: str) -> tuple:
        """Download the bulletin at ``url`` and extract its text and date.

        Returns:
            A ``(text, date)`` tuple: the normalised text of the
            ``news-detail`` node and the value produced by ``_parse_date``.
        """
        soup = get_soup(url)
        date = self._parse_date(soup)
        text = soup.find(class_="news-detail").text.replace("\n", " ").replace("\xa0", "")
        # Glue digit groups separated by whitespace ("1 234" -> "1234") so the
        # count pattern can capture the whole number with a single (\d+).
        text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
        return text, date

    def _parse_date(self, soup: BeautifulSoup) -> str:
        """Get the date from the bulletin page.

        Reads the text of the node with class ``date``, turns dots into spaces
        and passes the first ``regex["date"]`` match to ``clean_date`` with the
        format ``"%d %m %Y"``.

        Raises:
            AttributeError: If the ``date`` node is missing or its text does
                not match the date pattern.
        """
        date_text = soup.find(class_="date").text.replace(".", " ")
        date = re.search(self.regex["date"], date_text).group()
        return clean_date(date, "%d %m %Y")

    def _get_link_from_element(self, elem: element.Tag) -> str:
        """Get the absolute link from the relevant element.

        The ``href`` is resolved against ``_base_url`` instead of being
        concatenated to it, so root-relative hrefs give the same result as
        before while absolute hrefs and hrefs without a leading slash no longer
        produce a malformed URL.
        """
        from urllib.parse import urljoin

        return urljoin(self._base_url, elem["href"])

    def _parse_metrics(self, text: str) -> int:
        """Get the test count from the bulletin text.

        Converts the first capture group of ``regex["count"]`` to ``int`` and
        returns the result of ``clean_count`` on it.

        Raises:
            AttributeError: If the count pattern does not match ``text``.
        """
        count = int(re.search(self.regex["count"], text).group(1))
        return clean_count(count)