def read(self) -> pd.DataFrame:
    """Download the source spreadsheet and return it as a DataFrame.

    Fetches the page at ``self.source_url_ref``, resolves the spreadsheet link
    from it with ``self._parse_file_link`` and loads the sheet named "Date".

    Returns:
        pd.DataFrame: Whatever ``read_xlsx_from_url`` returns for that sheet.
    """
    page = get_soup(self.source_url_ref)
    # The scrape of the on-page "latest" figures is currently disabled:
    # self._read_latest(soup)
    xlsx_url = self._parse_file_link(page)
    return read_xlsx_from_url(xlsx_url, sheet_name="Date")
def _read_latest(self, soup):
    """Reads the latest data from the soup.

    Parses the HTML tables found in ``soup`` and stores a one-row DataFrame in
    ``self.latest`` with the columns ``people_vaccinated``,
    ``people_fully_vaccinated``, ``total_boosters`` and ``date``.

    First and second doses are the sum of the "Cumulative total" values of the
    first two tables on the page (the second one is presumably the children's
    table, judging by the original variable name -- confirm against the page).
    Boosters are "Boosters" + "Third primary" from the first table only.

    Args:
        soup: Parsed page. Must support ``str(soup)`` (HTML markup) and
            ``soup.text`` (visible text).

    Raises:
        ValueError: If the "Data in this section is as at 11:59pm <date>"
            sentence cannot be found in the page text.
    """
    # Local import keeps this block self-contained.
    from io import StringIO

    # pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by every pandas version.
    tables = pd.read_html(StringIO(str(soup)))
    latest = tables[0].set_index("Unnamed: 0")
    latest_kids = tables[1].set_index("Unnamed: 0")

    date_match = re.search(
        r"Data in this section is as at 11:59pm ([\d]+ [A-Za-z]+ 20\d{2})",
        soup.text,
    )
    if date_match is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError("Could not find the 'as at' date of the latest data in the page text.")
    latest_date = date_match.group(1)

    self.latest = pd.DataFrame(
        {
            "people_vaccinated": latest.loc["First dose", "Cumulative total"]
            + latest_kids.loc["First dose", "Cumulative total"],
            "people_fully_vaccinated": latest.loc["Second dose", "Cumulative total"]
            + latest_kids.loc["Second dose", "Cumulative total"],
            "total_boosters": latest.loc["Boosters", "Cumulative total"]
            + latest.loc["Third primary", "Cumulative total"],
            # The single-element list fixes the frame length to one row.
            "date": [clean_date(latest_date, "%d %B %Y")],
        }
    )
def _parse_file_link(self, soup: BeautifulSoup) -> str:
    """Build the absolute URL of the downloadable file.

    Locates the element with ``id="download"``, takes the first ``<a>`` that
    follows it and prefixes its ``href`` with ``self.base_url``.

    Args:
        soup: Parsed page to search.

    Returns:
        str: ``self.base_url`` immediately followed by the anchor's ``href``.
    """
    download_section = soup.find(id="download")
    first_anchor = download_section.find_next("a")
    relative_path = first_anchor["href"]
    return f"{self.base_url}{relative_path}"
def pipe_cumsum(self, df: pd.DataFrame) -> pd.DataFrame:
    """Replace the columns in ``self.columns_cumsum`` with their running totals.

    The frame is modified in place and the same object is returned.

    Args:
        df: Frame containing every column listed in ``self.columns_cumsum``.

    Returns:
        pd.DataFrame: The input frame, with those columns cumulatively summed.
    """
    target_columns = self.columns_cumsum
    running_totals = df[target_columns].cumsum()
    df[target_columns] = running_totals
    return df
def pipe_date(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``date`` column has been cleaned.

    The column is passed to ``clean_date_series`` together with the
    ``"%d/%m/%Y"`` format string.

    Args:
        df: Frame with a ``date`` column.

    Returns:
        pd.DataFrame: New frame with the ``date`` column replaced.
    """
    cleaned_dates = clean_date_series(df.date, "%d/%m/%Y")
    return df.assign(date=cleaned_dates)
def pipe_boosters(self, df: pd.DataFrame) -> pd.DataFrame:
    """Fold the third-dose and second-booster columns into ``total_boosters``.

    Args:
        df: Frame with ``total_boosters``, ``third_dose`` and
            ``total_boosters_2`` columns.

    Returns:
        pd.DataFrame: New frame where ``total_boosters`` is the row-wise sum of
        those three columns; the input frame is left untouched.
    """
    combined_boosters = df.total_boosters + df.third_dose + df.total_boosters_2
    return df.assign(total_boosters=combined_boosters)
def pipe_latest_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
    """Append ``self.latest`` to the date-sorted frame.

    When a date occurs more than once, only the last occurrence is kept, so a
    row from ``self.latest`` replaces a row of ``df`` with the same date.

    Args:
        df: Frame with a ``date`` column.

    Returns:
        pd.DataFrame: New frame sorted by ``date`` with the rows of
        ``self.latest`` appended and duplicated dates removed.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    # and behaves the same with ignore_index=True.
    merged = pd.concat([df.sort_values("date"), self.latest], ignore_index=True)
    return merged.drop_duplicates("date", keep="last")
def pipe_total_vaccinations(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``total_vaccinations`` column.

    Args:
        df: Frame with ``people_vaccinated``, ``people_fully_vaccinated`` and
            ``total_boosters`` columns.

    Returns:
        pd.DataFrame: New frame with ``total_vaccinations`` set to the row-wise
        sum of those three columns.
    """
    all_doses = df.people_vaccinated + df.people_fully_vaccinated + df.total_boosters
    return df.assign(total_vaccinations=all_doses)
def pipe_vaccine(self, df: pd.DataFrame) -> pd.DataFrame:
    """Apply the vaccine timeline to ``df``.

    Delegates to ``build_vaccine_timeline`` with ``self.vaccines_start_date``.

    Args:
        df: Frame to process.

    Returns:
        pd.DataFrame: The result of ``build_vaccine_timeline``.
    """
    timeline = build_vaccine_timeline(df, self.vaccines_start_date)
    return timeline
def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
    """Run every processing step on ``df``, in order.

    Steps: rename columns, cumulative sums, date formatting, boosters, total
    vaccinations, vaccine timeline, metadata, monotonic enforcement. Each step
    receives the output of the previous one.

    Args:
        df: Raw frame as returned by ``read``.

    Returns:
        pd.DataFrame: The frame produced by the last step.
    """
    out = df.pipe(self.pipe_rename_columns)
    out = out.pipe(self.pipe_cumsum)
    out = out.pipe(self.pipe_date)
    out = out.pipe(self.pipe_boosters)
    # Step currently disabled:
    # out = out.pipe(self.pipe_latest_metrics)
    out = out.pipe(self.pipe_total_vaccinations)
    out = out.pipe(self.pipe_vaccine)
    out = out.pipe(self.pipe_metadata)
    out = out.pipe(self.make_monotonic)
    return out
def export(self):
    """Read the source data, run the pipeline and export the result.

    The processed frame is handed to ``self.export_datafile`` with
    ``valid_cols_only=True``.
    """
    raw = self.read()
    processed = raw.pipe(self.pipeline)
    self.export_datafile(processed, valid_cols_only=True)