Source code for cowidev.testing.incremental.vanuatu

import re

from bs4 import BeautifulSoup
import pandas as pd
import tabula

from cowidev.utils import get_soup, clean_count
from cowidev.utils.clean import extract_clean_date
from cowidev.testing import CountryTestBase



[docs]
class Vanuatu(CountryTestBase):
    """Testing-data source for Vanuatu, built on ``CountryTestBase``.

    The methods of this class fetch the page at ``source_url``, follow the
    link to the latest surveillance report PDF, and read one date and one
    cumulative count from the first page of that PDF.
    """

    # Descriptive metadata; presumably consumed by the base class when the
    # output is assembled -- confirm against CountryTestBase.
    location: str = "Vanuatu"
    units: str = "people tested"
    source_label: str = "Ministry of Health"
    # Landing page that is downloaded and searched for the report link.
    source_url: str = "https://covid19.gov.vu/index.php/surveillance"
    # Prefix prepended to the report's href to build the PDF URL.
    _base_url: str = "https://covid19.gov.vu"
    # URL of the report PDF; stays None until `_parse_data` has resolved it.
    source_url_ref: str = None
    regex: dict = {
        # Matched against the anchor text to locate the report link.
        "title": r"Surveillance Report for Epi Week",
        # Matches a "start - end" date range (D/MM/20YY); only the end date
        # is captured.
        "date": r"\d{1,2}\/\d{2}\/20\d{2} - (\d{1,2}\/\d{2}\/20\d{2})",
    }

    @property
    def area(Self) -> list:
        """
        Areas of pdf to be extracted
        Returns:
            list: [[y1, x1, y2, x2], ...]
        For more info see: https://github.com/tabulapdf/tabula-java/wiki/Using-the-command-line-tabula-extractor-tool
        """
        return [[122, 188, 140, 419], [437, 7, 484, 299]]


[docs]
    def read(self) -> pd.DataFrame:
        """Download the page at ``source_url`` and return the parsed data.

        Returns:
            pd.DataFrame: Whatever ``_parse_data`` builds from the page.
        """
        return self._parse_data(get_soup(self.source_url))



[docs]
    def _parse_data(self, soup: BeautifulSoup) -> pd.DataFrame:
        """Parse data from soup"""
        # Get the article URL
        link = soup.find("a", text=re.compile(self.regex["title"]))["href"]
        if not link:
            raise ValueError("Article not found, please update the script")
        self.source_url_ref = f"{self._base_url}{link}"
        # Parse pdf tables from link
        tables = self._parse_pdf_tables()
        # Get the metrics
        count = self._parse_metrics(tables)
        # Get the date
        date = self._parse_date(tables)
        df = pd.DataFrame(
            {
                "Date": [date],
                "Cumulative total": [count],
            }
        )
        return df



[docs]
    def _parse_pdf_tables(self) -> list:
        """Extract the tables from page 1 of the PDF at ``source_url_ref``.

        Returns:
            list: The two tables read from the regions given by ``self.area``.

        Raises:
            ValueError: If the number of extracted tables is not exactly two.
        """
        extracted = tabula.read_pdf(
            self.source_url_ref,
            pages="1",
            stream=True,
            area=self.area,
        )
        # One table per requested region is expected; anything else means the
        # layout no longer matches the configured areas.
        if len(extracted) == 2:
            return extracted
        raise ValueError("PDF structure has changed, please update the script")



[docs]
    def _parse_metrics(self, tables: list) -> int:
        """Read the cumulative count from the second extracted table.

        Args:
            tables: Tables returned by ``_parse_pdf_tables``.

        Returns:
            int: The "Cumulative" value of row 0, passed through ``clean_count``.
        """
        metrics_table = tables[1]
        raw_value = metrics_table.loc[0, "Cumulative"]
        return clean_count(raw_value)



[docs]
    def _parse_date(self, tables: list) -> str:
        """Read the report date from the first extracted table.

        The date text is taken from the label of the first column of the
        first table and handed to ``extract_clean_date`` together with the
        "date" pattern and the ``%d/%m/%Y`` format.

        Args:
            tables: Tables returned by ``_parse_pdf_tables``.

        Returns:
            str: The value produced by ``extract_clean_date``.
        """
        header_text = tables[0].columns[0]
        date_pattern = self.regex["date"]
        return extract_clean_date(header_text, date_pattern, "%d/%m/%Y")



[docs]
    def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Pipeline for data processing"""
        return df.pipe(self.pipe_metadata)



[docs]
    def export(self):
        """Read the data, run it through ``pipeline`` and export it.

        The processed frame is handed to ``export_datafile`` with
        ``attach=True``.
        """
        raw = self.read()
        processed = raw.pipe(self.pipeline)
        self.export_datafile(processed, attach=True)





[docs]
def main():
    """Create a ``Vanuatu`` instance and run its export."""
    scraper = Vanuatu()
    scraper.export()