def _parse_data(self, soup: BeautifulSoup) -> tuple:
    """Build the data record from the parsed source page.

    Args:
        soup: Parsed source page to look for the relevant element in.

    Returns:
        A two-item tuple. ``(None, True)`` when no relevant element is found;
        otherwise ``(record, False)`` where ``record`` is a dict with the keys
        ``source_url``, ``date`` and ``daily_change``.
    """
    relevant = self._get_relevant_element(soup)
    if not relevant:
        # Nothing on this page matched; report it through the flag.
        return None, True

    # The element gives the article link and date; the metric itself is
    # parsed from the text of the linked article.
    source_url, published = self._get_link_and_date_from_element(relevant)
    article_text = self._get_text_from_url(source_url)
    return (
        {
            "source_url": source_url,
            "date": published,
            "daily_change": self._parse_metrics(article_text),
        },
        False,
    )
def _get_relevant_element(self, soup: BeautifulSoup) -> element.Tag:
    """Find the first news-feed anchor whose text matches the title regex.

    Args:
        soup: Parsed page to search.

    Returns:
        The first ``<a class="node-teaser-title">`` element whose text matches
        ``self.regex["title"]``, or ``None`` when none matches.
    """
    for anchor in soup.find_all("a", class_="node-teaser-title"):
        if re.search(self.regex["title"], anchor.text):
            return anchor
    return None
def _get_text_from_url(self, url: str) -> str:
    """Load the page at ``url`` and return its body text with commas removed.

    Args:
        url: Address passed to ``get_soup``.

    Returns:
        The stripped text of the ``<div class="row bodytext">`` element, with
        every ``,`` removed.
    """
    page = get_soup(url)
    body = page.find("div", class_="row bodytext")
    raw_text = body.get_text(strip=True)
    # Commas are dropped from the whole text before it is handed back.
    return raw_text.replace(",", "")
def _get_link_and_date_from_element(self, elem: element.Tag) -> tuple:
    """Return the link and the date belonging to ``elem``.

    Args:
        elem: Element to read the link and date from.

    Returns:
        ``(link, date)`` when a link is obtained.
        NOTE(review): a bare ``None`` (not a tuple) is returned when the link
        is falsy, so callers that unpack the result would fail in that case;
        confirm whether that is intended.
    """
    url = self._parse_link_from_element(elem)
    if url:
        return url, self._parse_date_from_element(elem)
    return None
def _parse_date_from_element(self, elem: element.Tag) -> str:
    """Get the date attached to a news element.

    The date text is read from the next sibling
    ``<div class="node-teaser-time">`` of ``elem``, narrowed to the part
    matching ``self.regex["date"]`` and passed to ``clean_date`` with the
    format ``"%d %b %Y"``.

    Args:
        elem: News element whose following sibling holds the date.

    Returns:
        The result of ``clean_date`` for the matched date text.

    Raises:
        AttributeError: If the sibling ``div`` is missing or its text does not
            match the date regex (the lookups return ``None``).
    """
    # ``find_next_sibling`` is the supported Beautiful Soup 4 name; the
    # camelCase ``findNextSibling`` alias is deprecated.
    time_tag = elem.find_next_sibling("div", class_="node-teaser-time")
    raw_date = re.search(self.regex["date"], time_tag.text).group()
    return clean_date(raw_date, "%d %b %Y")
def _parse_link_from_element(self, elem: element.Tag) -> str:
    """Build the full link for ``elem``.

    Args:
        elem: Element carrying an ``href`` attribute.

    Returns:
        ``self._base_url`` immediately followed by the element's ``href``.
    """
    target = elem["href"]
    return f"{self._base_url}{target}"
def _parse_metrics(self, text: str) -> int:
    """Extract the metric from the news text.

    Args:
        text: News text to search with ``self.regex["count"]``.

    Returns:
        The result of ``clean_count`` applied to the integer value of the
        regex's first capture group.
    """
    match = re.search(self.regex["count"], text)
    value = int(match.group(1))
    return clean_count(value)