Add support for all types of strings with Python 2 and Python 3

2026-06-02 02:26:07 +00:00 · 2016-01-28 01:36:59 +01:00
parent 04839ac9ec
commit e24a4b05c1
44 changed files with 108 additions and 133 deletions
@@ -4,7 +4,7 @@ History
 2.1.0 (unreleased)
 ------------------

- Nothing changed yet.
+- Add support for any type of string (binary, str, unicode) with Python 2 and Python 3.


 2.0.0 (2016-01-27)
@@ -129,15 +129,13 @@ GuessIt can be used from command line::
 It can also be used as a python module::

    >>> from guessit import guessit
-    >>> guessit(u'Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')  # doctest: +ALLOW_UNICODE
+    >>> guessit('Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')
    MatchesDict([('title', 'Treme'), ('season', 1), ('episode', 3), ('episode_title', 'Right Place, Wrong Time'), ('format', 'HDTV'), ('video_codec', 'XviD'), ('release_group', 'NoTV'), ('container', 'avi'), ('mimetype', 'video/x-msvideo'), ('type', 'episode')])

 ``MatchesDict`` is a dict that keeps matches ordering.

 Command line options can be given as dict or string to the second argument.

-GuessIt only accept unicode string, so you need to use ``u`` prefix for input string on python 2.
-
 Docker
 ------

@@ -13,15 +13,13 @@ API
 Example::

    >>> from guessit import guessit
-    >>> guessit(u'Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')  # doctest: +ALLOW_UNICODE
+    >>> guessit('Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')
    MatchesDict([('title', 'Treme'), ('season', 1), ('episode', 3), ('episode_title', 'Right Place, Wrong Time'), ('format', 'HDTV'), ('video_codec', 'XviD'), ('release_group', 'NoTV'), ('container', 'avi'), ('mimetype', 'video/x-msvideo'), ('type', 'episode')])

 ``MatchesDict`` is a dict that keeps matches ordering.

 Command line options can be given as dict or string to the second argument.

-GuessIt only accept unicode string, so you need to use ``u`` prefix for input string on python 2.
-
 Properties
 ----------
 Some properties have been renamed.
@@ -4,13 +4,12 @@
 Entry point module
 """
 # pragma: no cover
-from __future__ import print_function, unicode_literals
+from __future__ import print_function

 import os
 import logging
 import json
 import sys
-from io import open  #pylint:disable=redefined-builtin

 import six
 from guessit.jsonutils import GuessitEncoder
@@ -132,12 +131,12 @@ def main(args=None):  # pylint:disable=too-many-branches
    filenames = []
    if options.filename:
        for filename in options.filename:
-            if not isinstance(filename, six.text_type):  # pragma: no cover
-                encoding = sys.getfilesystemencoding()
-                filename = filename.decode(encoding)
            filenames.append(filename)
    if options.input_file:
-        input_file = open(options.input_file, 'r', encoding='utf-8')
+        if six.PY2:
+            input_file = open(options.input_file, 'r')
+        else:
+            input_file = open(options.input_file, 'r', encoding='utf-8')
        try:
            filenames.extend([line.strip() for line in input_file.readlines()])
        finally:
@@ -3,7 +3,6 @@
 """
 API functions that can be used by external software
 """
-from __future__ import unicode_literals
 try:
    from collections import OrderedDict
 except ImportError:  # pragma: no-cover
@@ -65,11 +64,25 @@ class GuessItApi(object):
        :return:
        :rtype:
        """
-        if not isinstance(string, six.text_type):
-            raise TypeError("guessit input must be %s." % six.text_type.__name__)
        options = parse_options(options)
-        return self.rebulk.matches(string, options).to_dict(options.get('advanced', False),
-                                                            options.get('implicit', False))
+        result_decode = False
+        result_encode = False
+        if six.PY2 and isinstance(string, six.text_type):
+            string = string.encode("latin-1")
+            result_decode = True
+        if six.PY3 and isinstance(string, six.binary_type):
+            string = string.decode('ascii')
+            result_encode = True
+        matches = self.rebulk.matches(string, options)
+        if result_decode:
+            for match in matches:
+                if isinstance(match.value, six.binary_type):
+                    match.value = match.value.decode("latin-1")
+        if result_encode:
+            for match in matches:
+                if isinstance(match.value, six.text_type):
+                    match.value = match.value.encode("ascii")
+        return matches.to_dict(options.get('advanced', False), options.get('implicit', False))

    def properties(self, options=None):
        """
@@ -27,6 +27,6 @@ class GuessitEncoder(json.JSONEncoder):
            ret['end'] = o.end
            return ret
        elif hasattr(o, 'name'):  # Babelfish languages/countries long name
-            return o.name
+            return str(o.name)
        else:  # pragma: no cover
            return str(o)
@@ -3,8 +3,6 @@
 """
 Options
 """
-from __future__ import unicode_literals
-
 import sys
 from argparse import ArgumentParser
 import shlex
@@ -3,8 +3,6 @@
 """
 Rebulk object default builder
 """
-from __future__ import unicode_literals
-
 from rebulk import Rebulk

 from .markers.path import path
@@ -3,8 +3,6 @@
 """
 Common module
 """
-from __future__ import unicode_literals
-
 seps = r' [](){}+*|=§-_~#/\.,;:'  # list of tags/words separators

 title_seps = r'-+/\|'  # separators for title
@@ -3,8 +3,6 @@
 """
 Comparators
 """
-from __future__ import unicode_literals
-
 try:
    from functools import cmp_to_key
 except ImportError:
@@ -3,8 +3,6 @@
 """
 Date
 """
-from __future__ import unicode_literals
-
 from dateutil import parser

 from rebulk.remodule import re
@@ -3,8 +3,6 @@
 """
 Formatters
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk.formatters import formatters
@@ -3,8 +3,6 @@
 """
 parse numeral from various formats
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 digital_numeral = r'\d{1,4}'
@@ -3,8 +3,6 @@
 """
 Validators
 """
-from __future__ import unicode_literals
-
 from functools import partial

 from rebulk.validators import chars_before, chars_after, chars_surround
@@ -3,11 +3,11 @@
 """
 Words utils
 """
-from __future__ import unicode_literals
+from collections import namedtuple

-from rebulk.remodule import re
+from guessit.rules.common import seps

-_words_rexp = re.compile(r'\w+', re.UNICODE)
+_Word = namedtuple('_Word', ['span', 'value'])


 def iter_words(string):
@@ -18,7 +18,21 @@ def iter_words(string):
    :return:
    :rtype: iterable[str]
    """
-    return _words_rexp.finditer(string.replace('_', ' '))
+    i = 0
+    last_sep_index = -1
+    inside_word = False
+    for char in string:
+        if ord(char) < 128 and char in seps:  # Make sure we don't exclude unicode characters.
+            if inside_word:
+                yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i])
+            inside_word = False
+            last_sep_index = i
+        else:
+            inside_word = True
+        i += 1
+    if inside_word:
+        yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i])
+

 # list of common words which could be interpreted as properties, but which
 # are far too common to be able to say they represent a property in the
@@ -3,8 +3,6 @@
 """
 Groups markers (...), [...] and {...}
 """
-from __future__ import unicode_literals
-
 from rebulk import Rebulk


@@ -3,8 +3,6 @@
 """
 Path markers
 """
-from __future__ import unicode_literals
-
 from rebulk import Rebulk

 from rebulk.utils import find_all
@@ -3,8 +3,6 @@
 """
 Processors
 """
-from __future__ import unicode_literals
-
 from collections import defaultdict
 import copy

@@ -155,10 +153,11 @@ def _count_title_words(value):
    """
    ret = 0
    for word in iter_words(value):
-        if word.group(0).istitle():
+        if word.value.istitle():
            ret += 1
    return ret

+
 class SeasonYear(Rule):
    """
    If a season is a valid year and no year was found, create an match with year.
@@ -3,8 +3,6 @@
 """
 audio_codec, audio_profile and audio_channels property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk, Rule, RemoveMatch
@@ -3,8 +3,6 @@
 """
 bonus property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk, AppendMatch, Rule
@@ -3,8 +3,6 @@
 """
 cd and cd_count properties
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk
@@ -3,8 +3,6 @@
 """
 container property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re, REGEX_AVAILABLE

 from rebulk import Rebulk
@@ -4,8 +4,6 @@
 country property
 """
 # pylint: disable=no-member
-from __future__ import unicode_literals
-
 import babelfish

 from rebulk import Rebulk
@@ -99,10 +97,11 @@ def find_countries(string, context=None):
    """
    ret = []
    for word_match in iter_words(string.strip().lower()):
+        word = word_match.value
        try:
-            country_object = babelfish.Country.fromguessit(word_match.group())
+            country_object = babelfish.Country.fromguessit(word)
            if is_valid_country(country_object, context):
-                ret.append((word_match.start(), word_match.end(), {'value': country_object}))
+                ret.append((word_match.span[0], word_match.span[1], {'value': country_object}))
        except babelfish.Error:
            continue
    return ret
@@ -3,8 +3,6 @@
 """
 crc and uuid properties
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk
@@ -3,8 +3,6 @@
 """
 date and year properties
 """
-from __future__ import unicode_literals
-
 from rebulk import Rebulk, RemoveMatch, Rule

 from ..common.date import search_date, valid_year
@@ -3,8 +3,6 @@
 """
 edition property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk
@@ -3,8 +3,6 @@
 """
 Episode title
 """
-from __future__ import unicode_literals
-
 from collections import defaultdict

 from rebulk import Rebulk, Rule, AppendMatch, RenameMatch
@@ -3,8 +3,6 @@
 """
 episode, season, episode_count, season_count and episode_details properties
 """
-from __future__ import unicode_literals
-
 import copy
 from collections import defaultdict

@@ -3,8 +3,6 @@
 """
 film property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk, AppendMatch, Rule
@@ -3,8 +3,6 @@
 """
 format property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk, RemoveMatch, Rule
@@ -4,8 +4,6 @@
 language and subtitle_language properties
 """
 # pylint: disable=no-member
-from __future__ import unicode_literals
-
 import copy

 import babelfish
@@ -125,8 +123,8 @@ def find_languages(string, context=None):

    matches = []
    for word_match in iter_words(string):
-        word = word_match.group()
-        start, end = word_match.span()
+        word = word_match.value
+        start, end = word_match.span

        lang_word = word.lower()
        key = 'language'
@@ -3,8 +3,6 @@
 """
 mimetype property
 """
-from __future__ import unicode_literals
-
 import mimetypes

 from rebulk import Rebulk, CustomRule, POST_PROCESS
@@ -3,8 +3,6 @@
 """
 other property
 """
-from __future__ import unicode_literals
-
 import copy

 from rebulk.remodule import re
@@ -3,8 +3,6 @@
 """
 part property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re, REGEX_AVAILABLE

 from rebulk import Rebulk
@@ -3,8 +3,6 @@
 """
 release_group property
 """
-from __future__ import unicode_literals
-
 import copy

 from rebulk.remodule import re
@@ -3,8 +3,6 @@
 """
 screen_size property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk, Rule, RemoveMatch
@@ -3,8 +3,6 @@
 """
 title property
 """
-from __future__ import unicode_literals
-
 import re

 from rebulk import Rebulk, Rule, AppendMatch, RemoveMatch, AppendTags
@@ -3,8 +3,6 @@
 """
 type property
 """
-from __future__ import unicode_literals
-
 from rebulk import CustomRule, Rebulk, POST_PROCESS
 from rebulk.match import Match

@@ -3,8 +3,6 @@
 """
 video_codec and video_profile property
 """
-from __future__ import unicode_literals
-
 from rebulk.remodule import re

 from rebulk import Rebulk, Rule, RemoveMatch
@@ -3,8 +3,6 @@
 """
 Website property.
 """
-from __future__ import unicode_literals
-
 from pkg_resources import resource_stream  # @UnresolvedImport
 from rebulk.remodule import re, REGEX_AVAILABLE

@@ -3,8 +3,7 @@
 # pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name

 import os
-
-import pytest
+import six

 from ..api import guessit, properties

@@ -12,20 +11,25 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file


 def test_default():
-    ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
+    ret = guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
    assert ret and 'title' in ret


+def test_forced_unicode():
+    ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
+    assert ret and 'title' in ret and isinstance(ret['title'], six.text_type)
+
+
+def test_forced_binary():
+    ret = guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
+    assert ret and 'title' in ret and isinstance(ret['title'], six.binary_type)
+
+
 def test_unicode():
-    ret = guessit(u'[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi')
+    ret = guessit('[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi')
    assert ret and 'title' in ret


-def test_main_non_unicode():
-    with pytest.raises(TypeError):
-        guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
-
-
 def test_properties():
    props = properties()
    assert 'video_codec' in props.keys()
@@ -9,19 +9,19 @@ from ..api import guessit


 def case1():
-    return guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
+    return guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')


 def case2():
-    return guessit(u'Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv')
+    return guessit('Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv')


 def case3():
-    return guessit(u'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi')
+    return guessit('Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi')


 def case4():
-    return guessit(u'Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv')
+    return guessit('Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv')


@pytest.mark.benchmark(
@@ -16,35 +16,35 @@ def test_main_no_args():


 def test_main():
-    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv'])
-
-
-def test_main_unicode():
-    main([u'[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi'])
-
-
-def test_main_non_unicode():
    main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv'])


+def test_main_unicode():
+    main(['[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi'])
+
+
+def test_main_forced_unicode():
+    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv'])
+
+
 def test_main_verbose():
-    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--verbose'])
+    main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--verbose'])


 def test_main_yaml():
-    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--yaml'])
+    main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--yaml'])


 def test_main_json():
-    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--json'])
+    main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--json'])


 def test_main_show_property():
-    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-P', 'title'])
+    main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-P', 'title'])


 def test_main_advanced():
-    main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-a'])
+    main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-a'])


 def test_main_input():
@@ -169,21 +169,34 @@ class TestYml(object):

        for string, expected in data.items():
            TestYml.set_default(expected, default)
-            if not isinstance(string, six.text_type):
-                string = six.text_type(string)
-            if not string_predicate or string_predicate(string):  # pylint: disable=not-callable
-                entry = self.check(string, expected)
-                if entry.ok:
-                    logger.debug(u'[' + filename + '] ' + six.text_type(entry))
-                elif entry.warning:
-                    logger.warning(u'[' + filename + '] ' + six.text_type(entry))
-                elif entry.error:
-                    logger.error(u'[' + filename + '] ' + six.text_type(entry))
-                    for line in entry.details:
-                        logger.error(u'[' + filename + '] ' + ' ' * 4 + line)
-                entries.append(entry)
+            entry = self.check_data(filename, string, expected)
+            entries.append(entry)
        entries.assert_ok()

+    def check_data(self, filename, string, expected):
+        if six.PY2 and isinstance(string, six.text_type):
+            string = string.encode('utf-8')
+            converts = []
+            for k, v in expected.items():
+                if isinstance(v, six.text_type):
+                    v = v.encode('utf-8')
+                    converts.append((k, v))
+            for k, v in converts:
+                expected[k] = v
+        if not isinstance(string, str):
+            string = str(string)
+        if not string_predicate or string_predicate(string):  # pylint: disable=not-callable
+            entry = self.check(string, expected)
+            if entry.ok:
+                logger.debug('[' + filename + '] ' + str(entry))
+            elif entry.warning:
+                logger.warning('[' + filename + '] ' + str(entry))
+            elif entry.error:
+                logger.error('[' + filename + '] ' + str(entry))
+                for line in entry.details:
+                    logger.error('[' + filename + '] ' + ' ' * 4 + line)
+        return entry
+
    def check(self, string, expected):
        negates, global_, string = self.parse_token_options(string)