
0c@_ö  ã               @   sB   d  d l  Z  d  d l Z d d l m Z Gd d „  d e ƒ Z d S)é    Né   )ÚProbingStatec               @   s£   e  Z d  Z d Z d d d „ Z d d „  Z e d d „  ƒ Z d	 d
 „  Z e d d „  ƒ Z	 d d „  Z
 e d d „  ƒ Z e d d „  ƒ Z e d d „  ƒ Z d S)ÚCharSetProbergffffffî?Nc             C   s(   d  |  _  | |  _ t j t ƒ |  _ d  S)N)Ú_stateÚlang_filterÚloggingÚ	getLoggerÚ__name__Úlogger)Úselfr   © r   ú@/tmp/pip-build-jynh7p1z/pip/pip/_vendor/chardet/charsetprober.pyÚ__init__'   s    		zCharSetProber.__init__c             C   s   t  j |  _ d  S)N)r   Z	DETECTINGr   )r   r   r   r   Úreset,   s    zCharSetProber.resetc             C   s   d  S)Nr   )r   r   r   r   Úcharset_name/   s    zCharSetProber.charset_namec             C   s   d  S)Nr   )r   Úbufr   r   r   Úfeed3   s    zCharSetProber.feedc             C   s   |  j  S)N)r   )r   r   r   r   Ústate6   s    zCharSetProber.statec             C   s   d S)Ng        r   )r   r   r   r   Úget_confidence:   s    zCharSetProber.get_confidencec             C   s   t  j d d |  ƒ }  |  S)Ns   ([ -])+ó    )ÚreÚsub)r   r   r   r   Úfilter_high_byte_only=   s    z#CharSetProber.filter_high_byte_onlyc             C   sƒ   t  ƒ  } t j d |  ƒ } xa | D]Y } | j | d d … ƒ | d d … } | j ƒ  rn | d k  rn d } | j | ƒ q" W| S)u9  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [Â€-Ã¿]
        marker: everything else [^a-zA-ZÂ€-Ã¿]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        s%   [a-zA-Z]*[€-ÿ]+[a-zA-Z]*[^a-zA-Z€-ÿ]?Nr   s   €r   éÿÿÿÿr   )Ú	bytearrayr   ÚfindallÚextendÚisalpha)r   ÚfilteredÚwordsÚwordZ	last_charr   r   r   Úfilter_international_wordsB   s    			z(CharSetProber.filter_international_wordsc             C   sè   t  ƒ  } d } d } x¯ t t |  ƒ ƒ D]› } |  | | d … } | d k rW d } n | d k ri d } | d k  r( | j ƒ  r( | | k r¹ | r¹ | j |  | | … ƒ | j d ƒ | d } q( W| sä | j |  | d	 … ƒ | S)
aÈ  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   r   ó   >ó   <Ts   €r   N)r   ÚrangeÚlenr   r   )r   r   Zin_tagÚprevÚcurrZbuf_charr   r   r   Úfilter_with_english_lettersg   s"    		z)CharSetProber.filter_with_english_letters)r	   Ú
__module__Ú__qualname__ZSHORTCUT_THRESHOLDr   r   Úpropertyr   r   r   r   Ústaticmethodr   r!   r(   r   r   r   r   r   #   s   %r   )r   r   Zenumsr   Úobjectr   r   r   r   r   Ú<module>   s   