import scrapy
from urllib.parse import urljoin, urlparse
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors import LinkExtractor
from twisted.internet.error import DNSLookupError, TimeoutError, TCPTimedOutError
from scrapy.spidermiddlewares.httperror import HttpError
import os
import requests
import logging
import csv
from datetime import datetime
import tldextract


logger = logging.getLogger(__name__)


class multibankSpider(scrapy.Spider):
    """Spider that seeds from a CSV of URLs and collects privacy-related pages.

    Landing pages are scanned for links matching ``landing_keywords``. Matching
    HTML links are followed through ``depth1_parse`` and ``depth2_parse``, which
    save the page body to disk; matching PDF links are downloaded by
    ``save_pdf``. Counters and failures collected along the way are logged by
    ``closed``.
    """

    name = 'multibank_sample'

    # Per-spider settings. FILES_STORE is also read directly by save_html and
    # save_pdf as the root directory for everything written to disk.
    custom_settings = {
        'ITEM_PIPELINES': {'singlebank.custom_files_pipeline.CustomFilesPipeline': 1},
        'FILES_STORE': 'your_file_path/data_artifact/crawler_and_data/crawled_files', # TODO: Update this path
        'LOG_LEVEL': 'DEBUG',
        'LOG_FILE': 'multibank_log.txt'
    }

    # Run statistics reported by closed().
    # success_crawls: incremented once per parsed page that yielded new links.
    success_crawls = 0
    # success_downloads / failed_downloads: outcome counters for save_pdf.
    success_downloads = 0
    # failed_crawls: incremented by errback_httpbin for every failed request.
    failed_crawls = 0
    failed_downloads = 0
    # Dicts with 'ID', 'url' and 'error' keys, appended by save_pdf and
    # errback_httpbin. Declared on the class, so the list object is shared.
    failed_requests = []
    # Hosts collected from the seed URLs (start_requests) and from the final
    # URL of each landing response (landing_parse).
    allowed_domains = set()

    def open(self):
        """Record the crawl start time and delegate to a parent ``open`` if one exists.

        ``scrapy.Spider`` does not define ``open()``, so an unconditional
        ``super().open()`` raises ``AttributeError``. The parent hook is
        therefore looked up defensively and only called when a base class
        actually provides it.

        Note that ``open`` is not among Scrapy's documented spider hooks, so
        it only runs when called explicitly.

        Returns:
            Whatever the parent ``open`` returns, or ``None`` when no parent
            implementation exists.
        """
        self.start_time = datetime.now()
        parent_open = getattr(super(), 'open', None)
        if callable(parent_open):
            return parent_open()
        return None

    def start_requests(self):
        """Yield one landing-page request per usable row of ``crawler_sample.csv``.

        The CSV must provide ``URL`` and ``ID`` columns. For every row the
        URL's host and its registrable domain are added to
        ``allowed_domains``, and a request is issued with the row ID and the
        original host stored in ``meta``.

        Rows whose URL is blank or lacks a scheme/host are logged and skipped.
        Building a ``scrapy.Request`` from such a value raises ``ValueError``,
        which would otherwise stop this generator and drop every later row.

        Yields:
            scrapy.Request: handled by ``landing_parse`` on success and by
            ``errback_httpbin`` on failure.

        Raises:
            OSError: if the CSV file cannot be opened.
            KeyError: if the ``URL`` or ``ID`` column is missing.
        """
        csv_file_path = 'crawler_sample.csv'

        # closed() reports elapsed time from self.start_time. Record it here
        # as well, because nothing guarantees that open() has been called.
        if getattr(self, 'start_time', None) is None:
            self.start_time = datetime.now()

        # newline='' is what the csv module recommends for files handed to a reader.
        with open(csv_file_path, mode='r', encoding='utf-8-sig', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                # Short rows give None for missing cells; treat that as blank.
                url = (row['URL'] or '').strip()
                id_ = row['ID']
                parsed_url = urlparse(url)
                if not parsed_url.scheme or not parsed_url.netloc:
                    logger.warning(
                        "Skipping CSV line %d (ID: %s): missing or invalid URL %r",
                        reader.line_num, id_, url,
                    )
                    continue

                domains = {parsed_url.netloc}
                domain_info = tldextract.extract(parsed_url.netloc)
                # Hosts without a public suffix (IP addresses, "localhost")
                # would otherwise produce a bogus entry such as "localhost.".
                if domain_info.domain and domain_info.suffix:
                    domains.add(f"{domain_info.domain}.{domain_info.suffix}")
                self.allowed_domains.update(domains)

                yield scrapy.Request(
                    url,
                    callback=self.landing_parse,
                    errback=self.errback_httpbin,
                    meta={'id': id_, 'original_domain': parsed_url.netloc},
                    dont_filter=True
                )

    # Lower-case substrings looked for in a link's text or URL. A link on a
    # landing page is kept when it contains any of landing_keywords.
    landing_keywords = ['privacy']
    # Broader keyword set applied to links found on pages below the landing page.
    depth1_keywords = ['notice', 'us consumer', 'u.s. consumer', 'consumer', 'statement', 'disclosure', 
                       'glba', 'policy', 'policies', 'california', 'ca residents', 'ca resident', 
                       'ccpa', 'online privacy', 'internet privacy', 'privacy', 'cookie', 'online policy',
                       'download', 'print', 'printer friendly version', 'pdf']
    # Alias of depth1_keywords (the same list object, not a copy).
    depth2_keywords = depth1_keywords
    # Normalized URLs already classified as PDF / HTML. They are declared on
    # the class and used by the parse callbacks to avoid handling a URL twice.
    pdf_links = set()
    html_links = set()

    def filter_hash_links(self, links, page_url):
        filtered_links = []
        page_url_base = self.strip_fragment(page_url).rstrip("/")
        for link in links:
            link_url_base = self.strip_fragment(link.url).rstrip("/")
            if link_url_base != page_url_base:
                filtered_links.append(link)
        return filtered_links

    
    def strip_fragment(self, url):
        # Remove URL fragment
        return url.split('#')[0]

    def landing_parse(self, response):
        """Find privacy-related links on a seed (landing) page.

        Every ``<a href>`` whose text or URL contains one of
        ``landing_keywords`` and that has not been seen before is recorded.
        PDF links are downloaded immediately through ``save_pdf``; HTML links
        are queued for ``depth1_parse``.

        Args:
            response: the landing-page response; ``response.meta['id']``
                carries the ID of the CSV row that produced it.

        Yields:
            dict: a summary item (only when at least one new link was found),
            followed by one follow-up request per new HTML link.
        """
        id_ = response.meta['id']
        # The seed may have redirected to another host; remember that host too.
        self.allowed_domains.add(urlparse(response.url).netloc)

        privacy_link_from_landing_page = set()
        depth0_pdf_links = set()
        depth0_html_links = set()

        # deny_extensions=[] replaces LinkExtractor's default list of ignored
        # extensions, so that links to .pdf files are extracted as well.
        link_extractor = LinkExtractor(
            deny_domains=['fdic.gov', 'ftc.gov', 'apple.com', 'get.adobe.com',
                          'consumerfinance.gov', 'consumerfed.org', 'google.com'],
            deny_extensions=[],
            tags='a',
            attrs='href',
        )
        links = link_extractor.extract_links(response)
        for link in self.filter_hash_links(links, response.url):
            # Lower-case once per link rather than once per keyword.
            text_lower = link.text.lower()
            url_lower = link.url.lower()
            if not any(keyword in text_lower or keyword in url_lower
                       for keyword in self.landing_keywords):
                continue

            link_to_save = self.strip_fragment(link.url).rstrip("/")
            if link_to_save in self.pdf_links or link_to_save in self.html_links:
                continue  # already handled earlier in this crawl

            privacy_link_from_landing_page.add(link_to_save)
            if self.is_pdf_url(link.url):
                self.pdf_links.add(link_to_save)
                depth0_pdf_links.add(link_to_save)
                self.save_pdf(link_to_save, id_)
            else:
                self.html_links.add(link_to_save)
                depth0_html_links.add(link_to_save)

        if depth0_pdf_links or depth0_html_links:
            self.success_crawls += 1
            yield {
                'ID': id_,
                'privacy_link_from_landing_page': list(privacy_link_from_landing_page),
                'depth_0_pdf_links': list(depth0_pdf_links),
                'depth_0_html_links': list(depth0_html_links)
            }

        # Follow html links to parse content
        for link in depth0_html_links:
            yield response.follow(link, callback=self.depth1_parse, errback=self.errback_httpbin, meta={'id': id_})

    def depth1_parse(self, response):
        """Save a depth-1 page and collect keyword-matching links from it.

        The page body is written to disk with ``save_html``. Links whose text
        or URL contains one of ``depth1_keywords`` and that have not been seen
        before are classified: PDFs are downloaded with ``save_pdf``, HTML
        pages are queued for ``depth2_parse``.

        Args:
            response: the page response; ``response.meta['id']`` carries the
                ID of the originating CSV row.

        Yields:
            dict: a summary item (only when at least one new link was found),
            followed by one follow-up request per new HTML link.
        """
        id_ = response.meta['id']
        filename = self.get_safe_filename(response.url) + '.html'
        self.save_html(response.body, filename, id_)

        depth1_pdf_links = set()
        depth1_html_links = set()

        # deny_extensions=[] replaces LinkExtractor's default list of ignored
        # extensions, so that links to .pdf files are extracted as well.
        link_extractor = LinkExtractor(
            deny_domains=['fdic.gov', 'ftc.gov', 'apple.com', 'get.adobe.com',
                          'consumerfinance.gov', 'consumerfed.org', 'google.com'],
            deny_extensions=[],
            tags='a',
            attrs='href',
        )
        links = link_extractor.extract_links(response)
        for link in self.filter_hash_links(links, response.url):
            text_lower = link.text.lower()
            url_lower = link.url.lower()
            if not any(keyword in text_lower or keyword in url_lower
                       for keyword in self.depth1_keywords):
                continue

            # Drop the fragment first, then the trailing slash. Doing it the
            # other way round leaves "page/#frag" as "page/", which would not
            # match the "page" key stored by landing_parse.
            link_to_save = self.strip_fragment(link.url).rstrip("/")

            # Skip known URLs before is_pdf_url, which issues a blocking HEAD
            # request; this matches the order used in landing_parse.
            if link_to_save in self.pdf_links or link_to_save in self.html_links:
                continue

            if self.is_pdf_url(link.url):
                self.pdf_links.add(link_to_save)
                depth1_pdf_links.add(link_to_save)
                self.save_pdf(link_to_save, id_)
            else:
                self.html_links.add(link_to_save)
                depth1_html_links.add(link_to_save)

        if depth1_pdf_links or depth1_html_links:
            self.success_crawls += 1
            yield {
                'ID': id_,
                'depth_1_html_saved': response.url,
                'depth_1_pdf_links': list(depth1_pdf_links),
                'depth_1_html_links': list(depth1_html_links),
            }

        # Follow html links to parse content
        for link in depth1_html_links:
            yield response.follow(link, callback=self.depth2_parse, errback=self.errback_httpbin, meta={'id': id_})
        


    def depth2_parse(self, response):
        """Save a depth-2 page and record keyword-matching links found on it.

        The page body is written to disk with ``save_html``. Links whose text
        or URL contains one of ``depth2_keywords`` and that have not been seen
        before are classified: PDFs are downloaded with ``save_pdf``, HTML
        links are only recorded (this is the last crawl level, nothing is
        followed from here).

        Args:
            response: the page response; ``response.meta['id']`` carries the
                ID of the originating CSV row.

        Yields:
            dict: a summary item, only when at least one new link was found.
        """
        id_ = response.meta['id']
        filename = self.get_safe_filename(response.url) + '.html'
        self.save_html(response.body, filename, id_)

        depth2_pdf_links = set()
        depth2_html_links = set()

        # deny_extensions=[] replaces LinkExtractor's default list of ignored
        # extensions, so that links to .pdf files are extracted as well.
        link_extractor = LinkExtractor(
            deny_domains=['fdic.gov', 'ftc.gov', 'apple.com', 'get.adobe.com',
                          'consumerfinance.gov', 'consumerfed.org', 'google.com'],
            deny_extensions=[],
            tags='a',
            attrs='href',
        )
        links = link_extractor.extract_links(response)
        for link in self.filter_hash_links(links, response.url):
            text_lower = link.text.lower()
            url_lower = link.url.lower()
            # Use this level's own keyword list (it is an alias of
            # depth1_keywords, so matching is unchanged).
            if not any(keyword in text_lower or keyword in url_lower
                       for keyword in self.depth2_keywords):
                continue

            # Drop the fragment first, then the trailing slash. Doing it the
            # other way round leaves "page/#frag" as "page/", which would not
            # match the "page" key stored by landing_parse.
            link_to_save = self.strip_fragment(link.url).rstrip("/")

            # Skip known URLs before is_pdf_url, which issues a blocking HEAD
            # request; this matches the order used in landing_parse.
            if link_to_save in self.pdf_links or link_to_save in self.html_links:
                continue

            if self.is_pdf_url(link.url):
                self.pdf_links.add(link_to_save)
                depth2_pdf_links.add(link_to_save)
                self.save_pdf(link_to_save, id_)
            else:
                self.html_links.add(link_to_save)
                depth2_html_links.add(link_to_save)

        if depth2_pdf_links or depth2_html_links:
            self.success_crawls += 1
            yield {
                'ID': id_,
                'depth_2_html_saved': response.url,
                'depth_2_pdf_links': list(depth2_pdf_links),
                'depth_2_html_links': list(depth2_html_links),
            }

    def get_safe_filename(self, url):
        return url.replace('http://', '').replace('https://', '').replace('/', '_').replace(':', '_').replace('?', '_')
    
    def save_html(self, html_content, filename, id_):
        """Write ``html_content`` to ``<FILES_STORE>/<id_>/<filename>``.

        Args:
            html_content: the bytes to write (the file is opened in binary mode).
            filename: name of the file inside the per-ID directory.
            id_: identifier used as the sub-directory name; created on demand.
        """
        store_root = multibankSpider.custom_settings['FILES_STORE']
        target_dir = os.path.join(store_root, str(id_))
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, filename), 'wb') as out:
            out.write(html_content)

    def save_pdf(self, url, id_):
        """Download ``url`` and store it as a PDF under ``<FILES_STORE>/<id_>/``.

        The download is best-effort: any failure is counted in
        ``failed_downloads``, appended to ``failed_requests`` and logged, but
        never raised to the caller.

        Args:
            url: address of the PDF document.
            id_: identifier used as the sub-directory name; created on demand.
        """
        directory_path = os.path.join(multibankSpider.custom_settings['FILES_STORE'], str(id_))
        os.makedirs(directory_path, exist_ok=True)
        filename = self.get_safe_filename(url) + '.pdf'
        file_path = os.path.join(directory_path, filename)
        # Same browser-like User-Agent that is_pdf_url uses for its HEAD check.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
        }
        try:
            # requests has no default timeout; without one a stalled server
            # would block this call indefinitely.
            response = requests.get(url, headers=headers, timeout=30)
            # Do not store a 4xx/5xx error page as if it were the document.
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except Exception as e:
            # Deliberately broad: one bad download must not abort the parse
            # callback that triggered it. The failure is recorded and logged.
            self.failed_downloads += 1
            self.failed_requests.append({
                'ID': id_,
                'url': url,
                'error': str(e)
            })
            logger.error(f"Error downloading {url} with ID: {id_}: {str(e)}")
        else:
            self.success_downloads += 1

    def is_pdf_url(self, url):
        """Determine if a URL points to a PDF file based on Content-Type.

        A HEAD request (following redirects) is sent and the media type of
        the ``Content-Type`` header is compared with ``application/pdf``.
        Header parameters such as ``; charset=binary`` are ignored, so
        ``application/pdf; charset=UTF-8`` is recognised as a PDF too.

        Args:
            url: the address to probe.

        Returns:
            bool: ``True`` when the server reports a PDF; ``False`` otherwise,
            including when the request fails (the error is logged).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'
        }
        try:
            response = requests.head(url, allow_redirects=True, headers=headers, timeout=10)
        except requests.RequestException as e:
            logger.error(f"Error checking PDF URL {url}: {str(e)}")
            return False

        content_type = response.headers.get('Content-Type', '')
        # Keep only the "type/subtype" part, dropping any ";param=value" suffix.
        media_type = content_type.split(';', 1)[0].strip().lower()
        return media_type == 'application/pdf'
    
    def errback_httpbin(self, failure):
        """Record a failed request and log what kind of failure it was.

        HTTP errors, DNS lookup errors and timeouts get a short label; any
        other failure is stored as its string representation. Every call
        increments ``failed_crawls`` and appends an entry with ``ID``,
        ``url`` and ``error`` keys to ``failed_requests``.

        Args:
            failure: the failure passed to the errback; its ``request``
                attribute identifies the request that failed.
        """
        request = failure.request
        id_value = request.meta.get('id', 'unknown')

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error(f'HttpError on {response.url} with ID: {id_value}')
            error_text = f'HttpError: {response.status}'
        elif failure.check(DNSLookupError):
            self.logger.error(f'DNSLookupError on {request.url} with ID: {id_value}')
            error_text = 'DNSLookupError'
        elif failure.check(TimeoutError, TCPTimedOutError):
            self.logger.error(f'TimeoutError on {request.url} with ID: {id_value}')
            error_text = 'TimeoutError'
        else:
            self.logger.error(f'OtherError: {failure} with ID: {id_value}')
            error_text = str(failure)

        self.failed_crawls += 1
        self.failed_requests.append({
            'ID': id_value,
            'url': request.url,
            'error': error_text,
        })

    def closed(self, reason):
        """Log a summary of the crawl: elapsed time, counters and failures.

        Success rates are computed from the success/failure counters and are
        reported as 0 when nothing was counted. If ``start_time`` was never
        set, the elapsed time is reported as unavailable.

        Args:
            reason: the close reason passed in by the caller; not used here.
        """
        def ratio(part, whole):
            # Avoid dividing by zero when nothing was counted.
            return part / whole if whole else 0

        crawl_total = self.success_crawls + self.failed_crawls
        crawl_rate = ratio(self.success_crawls, crawl_total)
        download_total = self.success_downloads + self.failed_downloads
        download_rate = ratio(self.success_downloads, download_total)

        logger.info("--- CRAWL SUMMARY ---")
        started_at = getattr(self, 'start_time', None)
        if not started_at:
            logger.info("Elapsed Time: Start time not set")
        else:
            logger.info(f"Elapsed Time: {datetime.now() - started_at}")

        summary_lines = (
            f"Total Crawled: {crawl_total}",
            f"Successful Crawls: {self.success_crawls}",
            f"Failed Crawls: {self.failed_crawls}",
            f"Success Crawl Rate: {crawl_rate:.2%}",
            f"Total Downloads: {download_total}",
            f"Successful Downloads: {self.success_downloads}",
            f"Failed Downloads: {self.failed_downloads}",
            f"Success Download Rate: {download_rate:.2%}",
        )
        for line in summary_lines:
            logger.info(line)

        if not self.failed_requests:
            return

        logger.info("--- DETAILED ERROR INFO ---")
        for entry in self.failed_requests:
            logger.info(f"ID: {entry['ID']}, URL: {entry['url']}, Error: {entry['error']}")