import re as regex
from urlparse import urlparse
import urllib
import urllib2
import json as json_lib
import pickle
import datetime
from itertools import chain


class Surly(object):
    """
    Client helper that rewrites outbound links so they point to the Sur.ly
    interstitial ("panel") host instead of the original target.

    Links can be rewritten inside a whole HTML document (``process``), one at
    a time (``process_url``) or in bulk (``process_multiple_urls``). Hosts in
    the instance whitelist are left untouched. The ``_get_cached_*`` and
    ``_cache_*`` hooks are no-ops here and are meant to be overridden by a
    subclass that has access to a persistent store.
    """
    # Library version string (not referenced elsewhere in this class)
    SURLY_VERSION = '2.0'
    # Host that serves the shortener and status API endpoints
    SURLY_API_HOST = 'surdotly.com'
    # Default value of the ``toolbar_id`` constructor argument
    SURLY_DEFAULT_TOOLBAR_ID = 'AA000014'
    # API path queried by _check_is_root_domain_alive_remotely
    SURLY_API_ROOT_STATUS_PATH = '/root_status'
    # API path of the shortener endpoint used by _call_shortener_service
    SURLY_API_PATH = '/shorten'
    # The three paths below are not referenced elsewhere in this class
    SURLY_API_TRACK_PATH = '/track/'
    SURLY_API_GET_TOOLBAR_ID = '/get_settings_id/'
    SURLY_API_IFRAME_LOGIN = '/iframe_login/'
    # Host that rewritten links point to by default
    SURLY_PANEL_HOST = 'sur.ly'
    # Host that rewritten links point to after _use_backup_domain() is called
    SURLY_BACKUP_PANEL_HOST = 'surdotly.com'
    # Timeout handed to urllib2.urlopen for every API call (seconds)
    SURLY_API_TIMEOUT = 0.6

    def __init__(self, toolbar_id=SURLY_DEFAULT_TOOLBAR_ID, use_shortener=False, host=None):
        """
        Set up the link processor.

        :param toolbar_id: str, identifier appended to every rewritten link;
            a falsy value disables the suffix
        :param use_shortener: bool, when true the remote shortener service is
            consulted before links are rewritten
        :param host: str, optional name of the calling site; it is only added
            to the User-agent header sent with API requests
        """
        self.surly_api_ua = 'surly_api_caller python2'
        if isinstance(host, (str, unicode)):
            self.surly_api_ua = '%s (running %s)' % (self.surly_api_ua, host.strip())

        # the panel hosts themselves must never be rewritten
        self.whitelist = [self.SURLY_PANEL_HOST, self.SURLY_BACKUP_PANEL_HOST]
        self.use_shortener = False
        self.shortener_cache = {}
        self.timeout = self.SURLY_API_TIMEOUT
        self.api_host = self.SURLY_API_HOST
        self.api_path = self.SURLY_API_PATH
        self.api_root_status_path = self.SURLY_API_ROOT_STATUS_PATH
        self.panel_host = self.SURLY_PANEL_HOST
        # NOTE(review): _is_root_domain_alive() only consults the cache and the
        # status API while this flag is None; starting at True means the check
        # never runs unless a caller or subclass resets the flag. Confirm that
        # this is intended.
        self.is_root_domain_alive = True

        self.toolbar_id = toolbar_id

        # Strips an optional scheme and a literal "www." prefix. The dot is
        # escaped (an unescaped one would also eat e.g. "wwwx") and matching is
        # case-insensitive so that "HTTP://WWW.Example.com" is normalised too.
        self.regex_whitelist = regex.compile(r"(?i)^(https?://)?(www\.)?")
        self.regex_shortener = regex.compile(r"(?is)(<\s*a[^>]*\s+href=([\"\']?))https?://([^\s\"\'>\.]+\.[^\s\"\'>]+)")
        self.regex_process = regex.compile(r"(?is)(<\s*a[^>]*\s+href=([\"\']?))(https?://)([^\s\"\'>\.]+\.[^\s\"\'>]+)")
        self.regex_normalize_www = regex.compile(r"^www\.")
        self.regex_normalize = regex.compile(r"#.*$")
        self.regex_smart_url_encode = regex.compile(r"(?i)^(https?://)?(www\.)?([^/]+/?)(.*?)$")
        self.regex_process_url = regex.compile(r"(?i)^(https?://)([^\s\"\'>\.]+\..+)$")

        if use_shortener:
            self._enable_shortener()

    def process(self, html, encoding='utf-8'):
        """
        Rewrite the ``href`` of every ``<a>`` tag in html that points to an
        external http(s) address so that it goes through the panel host.

        Whitelisted hosts and links without a usable host name are left
        exactly as they were. The ``#fragment`` part of a rewritten link is
        dropped.

        :param html: unicode|str, document to process; byte strings are
            decoded with ``_decode_html`` first
        :param encoding: str, encoding tried first when html is a byte string
        :return: unicode, html with the links replaced
        """
        if not self._is_root_domain_alive():
            self._use_backup_domain()

        # Decode once up front so that the shortener pre-pass and the
        # replacement pass see the same text (and the same cache keys).
        if not isinstance(html, unicode):
            html = self._decode_html(html, encoding)

        if self.use_shortener:
            links = []
            for match in self.regex_shortener.finditer(html):
                host = urlparse('http://' + match.group(3)).hostname
                # hostname is None for things like "http:///path.ext"
                if host and not self._is_whitelisted(host.lower()):
                    links.append(match.group(3))

            self._shorten(links)

        def replace_link(match):
            """
            Build the replacement text for one matched ``<a ... href=`` link
            :param match: Match
            :return: unicode
            """
            prefix, scheme, link = match.group(1), match.group(3), match.group(4)
            normalized_link = self._normalize_url(link) or link

            host = urlparse('http://' + normalized_link).hostname

            # no host to check, or a whitelisted one: keep the original text
            if not host or self._is_whitelisted(host.lower()):
                return match.group(0)

            link = self.regex_normalize.sub('', link)

            if self.use_shortener and normalized_link in self.shortener_cache:
                link = self.shortener_cache[normalized_link]

            link = 'o/' + self._smart_url_encode(link)

            if self.toolbar_id:
                link += '/' + self.toolbar_id

            return prefix + scheme + self.panel_host + '/' + link

        return self.regex_process.sub(replace_link, html)

    def process_url(self, url):
        """
        Replace a single url with one pointing to the Sur.ly interstitial
        page, according to the initialization parameters.

        The url is returned unchanged when it is not an http(s) address, has
        no usable host name, or its host is whitelisted.

        :param url: str
        :return: str
        """
        if not self._is_root_domain_alive():
            self._use_backup_domain()

        matches = self.regex_process_url.match(url)

        if not matches:
            return url

        scheme, link = matches.group(1), matches.group(2)

        host = urlparse("http://" + link).hostname

        # hostname is None for things like "http:///path.ext"
        if not host or self._is_whitelisted(host.lower()):
            return url

        if self.use_shortener:
            self._shorten([link])
            normalized_link = self._normalize_url(link) or self.regex_normalize.sub('', link)
            # fall back to the full link when no short id is known
            link = self.shortener_cache.get(normalized_link, link)

        link = 'o/' + self._smart_url_encode(link)

        if self.toolbar_id:
            link += '/' + self.toolbar_id

        return scheme + self.panel_host + '/' + link

    def process_multiple_urls(self, urls):
        """
        Replace several urls with ones pointing to the Sur.ly interstitial
        page, according to the initialization parameters.

        The result has the same length and order as the input. Entries that
        are not http(s) addresses, have no usable host name, or whose host is
        whitelisted are returned unchanged. When the shortener is enabled all
        remaining links are shortened with a single ``_shorten`` call.

        :param urls: list
        :return: list
        """
        if not self._is_root_domain_alive():
            self._use_backup_domain()

        # start from a copy: entries that need no rewriting stay as given
        processed_urls = list(urls)
        # (index, scheme, link) for every entry that has to be rewritten
        pending = []

        for index, url in enumerate(processed_urls):
            matches = self.regex_process_url.match(url)
            if not matches:
                continue

            scheme, link = matches.group(1), matches.group(2)
            host = urlparse('http://' + link).hostname

            # hostname is None for things like "http:///path.ext"
            if not host or self._is_whitelisted(host.lower()):
                continue

            pending.append((index, scheme, link))

        if self.use_shortener:
            self._shorten([link for _, _, link in pending])

        for index, scheme, link in pending:
            if self.use_shortener:
                normalized_link = self._normalize_url(link) or self.regex_normalize.sub('', link)
                # fall back to the full link when no short id is known
                link = self.shortener_cache.get(normalized_link, link)

            link = 'o/' + self._smart_url_encode(link)

            if self.toolbar_id:
                link += '/' + self.toolbar_id

            processed_urls[index] = scheme + self.panel_host + '/' + link

        return processed_urls

    def _decode_html(self, html, encoding='utf-8'):
        """
        Decode a byte string into unicode.

        The given encoding is tried first, then cp1251; if both fail the
        result of decoding as koi8_r is returned.
        :param html: str, raw bytes
        :param encoding: str, preferred encoding
        :return: unicode
        """
        for candidate in (encoding, 'cp1251'):
            try:
                return html.decode(encoding=candidate)
            except UnicodeError:
                # fall through to the next candidate
                continue
        return html.decode(encoding='koi8_r')

    def _get_cached_root_status(self):
        """
        Hook: fetch the previously cached service status from a local store.

        This base implementation has no store and always returns an empty
        dict. A subclass is expected to override it and hand back whatever
        was given to ``_cache_root_status``.

        :return: dict
        """
        return dict()

    def _check_is_root_domain_alive_remotely(self):
        """
        Ask the status API whether the root domain is alive.

        Any response other than the literal "BAD" counts as alive.
        :return: bool
        """
        status = self._perform_request(self.api_host + self.api_root_status_path)
        return status != "BAD"

    def _cache_root_status(self, root_domain_alive_info):
        """
        Hook: persist the service status in a local store (DB, Memcache, ...).

        This base implementation stores nothing. A subclass is expected to
        override it. ``_is_root_domain_alive`` calls it with the pickled
        status record.
        :param root_domain_alive_info: status record to store
        """
        return None

    def _is_root_domain_alive(self):
        """
        Checks the status of the main domain sur.ly

        Also handles caching logic on object-level and db-level
        :return: bool
        """
        if self.is_root_domain_alive is None:
            root_domain_alive_info = self._get_cached_root_status()

            if not len(root_domain_alive_info) == 0:
                try:
                    root_domain_alive_info = pickle.loads(root_domain_alive_info)
                except pickle.PickleError:
                    pass

            if not isinstance(root_domain_alive_info, dict) \
                or len(root_domain_alive_info) == 0 \
                or root_domain_alive_info['last_check'] < datetime.date.today() + datetime.timedelta(1):

                self.is_root_domain_alive = self._check_is_root_domain_alive_remotely()
                self._cache_root_status(pickle.dumps({
                    "last_check": datetime.date.today(),
                    "is_alive": self.is_root_domain_alive
                }))
            else:
                self.is_root_domain_alive = root_domain_alive_info['is_alive']

        return self.is_root_domain_alive

    def _enable_shortener(self):
        """
        Switch on the use of the remote shortener service for this instance
        """
        shortener_enabled = True
        self.use_shortener = shortener_enabled

    def add_to_whitelist(self, domain):
        """
        Add a domain to the whitelist; links to it (and to its subdomains)
        are neither processed nor replaced.

        The value may be given as a bare host or as a url: surrounding
        whitespace, the prefix matched by ``regex_whitelist`` and everything
        from the first ``/``, ``?`` or ``#`` on are removed, and the rest is
        lower-cased. Non-string values, empty results and entries that are
        already present are ignored.
        :param domain: str
        """
        if not isinstance(domain, (str, unicode)):
            return

        normalized = self.regex_whitelist.sub('', domain.strip()).lower()
        # hosts are compared without path, so "example.com/" would never match
        normalized = regex.split(r'[/?#]', normalized, 1)[0]

        if normalized and normalized not in self.whitelist:
            self.whitelist.append(normalized)

    def _use_backup_domain(self):
        """
        Make rewritten links point to the backup panel host from now on
        """
        backup_host = self.SURLY_BACKUP_PANEL_HOST
        self.panel_host = backup_host

    def _is_whitelisted(self, domain):
        """
        Tell whether a host is covered by the whitelist.

        A host is covered when it equals a whitelist entry or is a subdomain
        of one, i.e. ends with "." followed by the entry.
        :param domain: str, host name to check
        :return: bool
        """
        for entry in self.whitelist:
            if domain == entry:
                return True
            # subdomains of a whitelisted entry are excluded as well
            if domain.endswith('.' + entry):
                return True
        return False

    def _normalize_url(self, url):
        """
        Normalize url for replacement
        :param url:
        :return:
        """
        url = self.regex_normalize_www.sub('', url)
        url = self.regex_normalize.sub('', url)
        return url

    def _get_cached_short_ids(self, urls):
        """
        Hook: look up shortlink identifiers in a local store.

        Receives the normalized target urls (as produced by
        ``_normalize_url``) and should return a dict that maps each url found
        in the store to its identifier. This base implementation has no store
        and always returns an empty dict.
        :param urls: list
        :return: dict
        """
        return dict()

    def _build_query_for_server_recursively(self, params):
        """
        URL-encode a (possibly nested) dictionary into a query string.

        Nested dictionaries are flattened with bracket notation; pairs are
        joined with "&" in the iteration order of the dictionaries.
        >>> data = {'d': {'e': {'f&g': 'h*i'}}}
        >>> Surly()._build_query_for_server_recursively(data)
        'd[e][f%26g]=h%2Ai'

        Keys and values that are not strings are converted to text first;
        unicode text is encoded as UTF-8 before it is percent-encoded.

        :param params: dict
        :return: str
        """

        def quote(item):
            # urllib.quote only copes with byte strings reliably: on Python 2
            # it raises KeyError for non-ASCII unicode and TypeError for ints
            if isinstance(item, unicode):
                item = item.encode('utf-8')
            elif not isinstance(item, str):
                item = unicode(item).encode('utf-8')
            return urllib.quote(item)

        def flatten(node, path=()):
            pairs = []

            for key, value in node.items():
                key_path = path + (key,)
                if hasattr(value, 'values'):
                    # nested mapping: descend, remembering the keys passed so far
                    pairs.extend(flatten(value, key_path))
                else:
                    name = quote(key_path[0]) + ''.join('[%s]' % quote(part) for part in key_path[1:])
                    pairs.append('%s=%s' % (name, quote(value)))
            return pairs

        return '&'.join(flatten(params))

    def _perform_request(self, shortener_api_url, method='GET', params=None):
        """
        Perform an http request with urllib2 and return the response body.

        For GET the parameters are appended to the url as a query string, for
        POST they are sent as the request body. Connection problems, timeouts
        and http error statuses do not raise: an empty string is returned
        instead, which callers treat as "no answer".

        :param shortener_api_url: str, host and path without the scheme
        :param method: str, 'GET' or 'POST'
        :param params: dict
        :return: str, response body with surrounding whitespace removed, or
            '' when the request failed
        """
        has_params = isinstance(params, dict) and len(params) > 0

        if 'GET' == method and has_params:
            shortener_api_url += '?' + self._build_query_for_server_recursively(params)

        request = urllib2.Request('http://' + shortener_api_url)

        if 'POST' == method and has_params:
            # attaching data is what turns the request into a POST
            request.add_data(self._build_query_for_server_recursively(params))

        request.add_header('User-agent', self.surly_api_ua)

        # urllib2.URLError, urllib2.HTTPError, socket.error and socket.timeout
        # are all IOError subclasses. The API is an optional helper, so a slow
        # or unreachable service must not break link processing.
        try:
            response = urllib2.urlopen(request, timeout=self.timeout)
        except IOError:
            return ''

        try:
            response_actual_string = response.read()
        except IOError:
            return ''
        finally:
            response.close()

        return response_actual_string.strip()

    def _call_shortener_service(self, urls):
        """
        Send urls to the remote shortener service.

        The result always has the same shape, whatever the service answers:
        ``urls`` is a dict and ``errors`` is a list. An empty input does not
        cause a request.

        :param urls: list of urls to shorten
        :return: dict with the keys 'urls' (dict) and 'errors' (list)
        """
        if not urls:
            return {'urls': {}, 'errors': []}

        params = {
            'raw': 1,
            'urls': "\r\n".join(urls)
        }

        response = self._perform_request(self.api_host + self.api_path, 'POST', params)

        if not response:
            return {'urls': {}, 'errors': ['Connection error']}

        try:
            result = json_lib.loads(response)
        except ValueError as error:
            return {'urls': {}, 'errors': [str(error)]}

        if not isinstance(result, dict):
            return {'urls': {}, 'errors': ['Unexpected response format']}

        # callers iterate over result['urls'] as a mapping; a missing key or
        # another JSON type (e.g. an empty array) is normalised to {}
        if not isinstance(result.get('urls'), dict):
            result['urls'] = {}
        result.setdefault('errors', [])

        return result

    def _shorten_remotely(self, urls):
        """
        Shorten urls with the remote service.

        Duplicates are removed before the service is called. Anything other
        than a non-empty list yields an empty dict without a request, and so
        does a service answer that carries no url mapping.

        :param urls: list
        :return: dict mapping url to short id
        """
        if not isinstance(urls, list) or not urls:
            return {}

        result = self._call_shortener_service(list(set(urls)))

        # the answer comes from a remote service: do not assume its shape
        if not isinstance(result, dict):
            return {}

        short_ids = result.get('urls')
        if not isinstance(short_ids, dict):
            return {}

        return short_ids

    def _shorten(self, urls):
        """
        Shorten urls remotely and add them to cache
        :param urls: list
        """
        # remote those that are not in object cache
        filtered_urls = []
        for url in urls:
            normalized_url = self._normalize_url(url)
            if normalized_url and normalized_url not in self.shortener_cache:
                filtered_urls.append(normalized_url)
        urls = filtered_urls

        # get from the local store
        self.shortener_cache = dict(chain(self.shortener_cache.items(), self._get_cached_short_ids(urls).items()))

        # get from the remote store and put to the local store
        urls = [url for url in urls if url not in self.shortener_cache]
        remote_short_ids = self._shorten_remotely(urls)

        self.shortener_cache = dict(chain(self.shortener_cache.items(), remote_short_ids.items()))

        self._cache_short_ids(remote_short_ids)

    def _cache_short_ids(self, remote_short_ids):
        """
        Hook: persist shortlink identifiers received from the shortener
        service in a local store (memcached, an RDBMS, ...).

        ``_shorten`` passes the mapping of target urls to identifiers that
        the remote service returned. This base implementation stores nothing.
        :param remote_short_ids: dict mapping url to short id
        """
        return None

    def _smart_url_encode(self, url):
        """
        Smart urlencode. We need to remove all redundant data from url like "www." or
         backslash from the end of domain
        :param url: str
        :return: str
        """
        matches = self.regex_smart_url_encode.match(url)
        if matches is not None:
            regex_matched_groups = dict(tuple(enumerate(matches.groups())))
            del regex_matched_groups[0]
            if regex_matched_groups[2]:
                del regex_matched_groups[1]
            if regex_matched_groups[3] == '':
                regex_matched_groups[2] = regex_matched_groups[2].rstrip('/')
            regex_matched_groups[3] = urllib.quote(regex_matched_groups[3], safe='')
            return ''.join(regex_matched_groups.itervalues())
        else:
            return urllib.quote(url, safe='')