Add support for all types of strings with Python 2 and Python 3

This commit is contained in:
Toilal
2016-01-28 01:36:59 +01:00
parent 04839ac9ec
commit e24a4b05c1
44 changed files with 108 additions and 133 deletions
+1 -1
View File
@@ -4,7 +4,7 @@ History
2.1.0 (unreleased)
------------------
- Nothing changed yet.
- Add support for any type of string with python 2 and python 3 (binary, str, unicode).
2.0.0 (2016-01-27)
+1 -3
View File
@@ -129,15 +129,13 @@ GuessIt can be used from command line::
It can also be used as a python module::
>>> from guessit import guessit
>>> guessit(u'Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi') # doctest: +ALLOW_UNICODE
>>> guessit('Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')
MatchesDict([('title', 'Treme'), ('season', 1), ('episode', 3), ('episode_title', 'Right Place, Wrong Time'), ('format', 'HDTV'), ('video_codec', 'XviD'), ('release_group', 'NoTV'), ('container', 'avi'), ('mimetype', 'video/x-msvideo'), ('type', 'episode')])
``MatchesDict`` is a dict that preserves the order of matches.
Command line options can be given as a dict or a string in the second argument.
GuessIt only accept unicode string, so you need to use ``u`` prefix for input string on python 2.
Docker
------
+1 -3
View File
@@ -13,15 +13,13 @@ API
Example::
>>> from guessit import guessit
>>> guessit(u'Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi') # doctest: +ALLOW_UNICODE
>>> guessit('Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')
MatchesDict([('title', 'Treme'), ('season', 1), ('episode', 3), ('episode_title', 'Right Place, Wrong Time'), ('format', 'HDTV'), ('video_codec', 'XviD'), ('release_group', 'NoTV'), ('container', 'avi'), ('mimetype', 'video/x-msvideo'), ('type', 'episode')])
``MatchesDict`` is a dict that preserves the order of matches.
Command line options can be given as a dict or a string in the second argument.
GuessIt only accept unicode string, so you need to use ``u`` prefix for input string on python 2.
Properties
----------
Some properties have been renamed.
+5 -6
View File
@@ -4,13 +4,12 @@
Entry point module
"""
# pragma: no cover
from __future__ import print_function, unicode_literals
from __future__ import print_function
import os
import logging
import json
import sys
from io import open #pylint:disable=redefined-builtin
import six
from guessit.jsonutils import GuessitEncoder
@@ -132,12 +131,12 @@ def main(args=None): # pylint:disable=too-many-branches
filenames = []
if options.filename:
for filename in options.filename:
if not isinstance(filename, six.text_type): # pragma: no cover
encoding = sys.getfilesystemencoding()
filename = filename.decode(encoding)
filenames.append(filename)
if options.input_file:
input_file = open(options.input_file, 'r', encoding='utf-8')
if six.PY2:
input_file = open(options.input_file, 'r')
else:
input_file = open(options.input_file, 'r', encoding='utf-8')
try:
filenames.extend([line.strip() for line in input_file.readlines()])
finally:
+18 -5
View File
@@ -3,7 +3,6 @@
"""
API functions that can be used by external software
"""
from __future__ import unicode_literals
try:
from collections import OrderedDict
except ImportError: # pragma: no-cover
@@ -65,11 +64,25 @@ class GuessItApi(object):
:return:
:rtype:
"""
if not isinstance(string, six.text_type):
raise TypeError("guessit input must be %s." % six.text_type.__name__)
options = parse_options(options)
return self.rebulk.matches(string, options).to_dict(options.get('advanced', False),
options.get('implicit', False))
result_decode = False
result_encode = False
if six.PY2 and isinstance(string, six.text_type):
string = string.encode("latin-1")
result_decode = True
if six.PY3 and isinstance(string, six.binary_type):
string = string.decode('ascii')
result_encode = True
matches = self.rebulk.matches(string, options)
if result_decode:
for match in matches:
if isinstance(match.value, six.binary_type):
match.value = match.value.decode("latin-1")
if result_encode:
for match in matches:
if isinstance(match.value, six.text_type):
match.value = match.value.encode("ascii")
return matches.to_dict(options.get('advanced', False), options.get('implicit', False))
def properties(self, options=None):
"""
+1 -1
View File
@@ -27,6 +27,6 @@ class GuessitEncoder(json.JSONEncoder):
ret['end'] = o.end
return ret
elif hasattr(o, 'name'): # Babelfish languages/countries long name
return o.name
return str(o.name)
else: # pragma: no cover
return str(o)
-2
View File
@@ -3,8 +3,6 @@
"""
Options
"""
from __future__ import unicode_literals
import sys
from argparse import ArgumentParser
import shlex
-2
View File
@@ -3,8 +3,6 @@
"""
Rebulk object default builder
"""
from __future__ import unicode_literals
from rebulk import Rebulk
from .markers.path import path
-2
View File
@@ -3,8 +3,6 @@
"""
Common module
"""
from __future__ import unicode_literals
seps = r' [](){}+*|=§-_~#/\.,;:' # list of tags/words separators
title_seps = r'-+/\|' # separators for title
-2
View File
@@ -3,8 +3,6 @@
"""
Comparators
"""
from __future__ import unicode_literals
try:
from functools import cmp_to_key
except ImportError:
-2
View File
@@ -3,8 +3,6 @@
"""
Date
"""
from __future__ import unicode_literals
from dateutil import parser
from rebulk.remodule import re
-2
View File
@@ -3,8 +3,6 @@
"""
Formatters
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk.formatters import formatters
-2
View File
@@ -3,8 +3,6 @@
"""
parse numeral from various formats
"""
from __future__ import unicode_literals
from rebulk.remodule import re
digital_numeral = r'\d{1,4}'
-2
View File
@@ -3,8 +3,6 @@
"""
Validators
"""
from __future__ import unicode_literals
from functools import partial
from rebulk.validators import chars_before, chars_after, chars_surround
+18 -4
View File
@@ -3,11 +3,11 @@
"""
Words utils
"""
from __future__ import unicode_literals
from collections import namedtuple
from rebulk.remodule import re
from guessit.rules.common import seps
_words_rexp = re.compile(r'\w+', re.UNICODE)
_Word = namedtuple('_Word', ['span', 'value'])
def iter_words(string):
@@ -18,7 +18,21 @@ def iter_words(string):
:return:
:rtype: iterable[str]
"""
return _words_rexp.finditer(string.replace('_', ' '))
i = 0
last_sep_index = -1
inside_word = False
for char in string:
if ord(char) < 128 and char in seps: # Make sure we don't exclude unicode characters.
if inside_word:
yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i])
inside_word = False
last_sep_index = i
else:
inside_word = True
i += 1
if inside_word:
yield _Word(span=(last_sep_index+1, i), value=string[last_sep_index+1:i])
# list of common words which could be interpreted as properties, but which
# are far too common to be able to say they represent a property in the
-2
View File
@@ -3,8 +3,6 @@
"""
Groups markers (...), [...] and {...}
"""
from __future__ import unicode_literals
from rebulk import Rebulk
-2
View File
@@ -3,8 +3,6 @@
"""
Path markers
"""
from __future__ import unicode_literals
from rebulk import Rebulk
from rebulk.utils import find_all
+2 -3
View File
@@ -3,8 +3,6 @@
"""
Processors
"""
from __future__ import unicode_literals
from collections import defaultdict
import copy
@@ -155,10 +153,11 @@ def _count_title_words(value):
"""
ret = 0
for word in iter_words(value):
if word.group(0).istitle():
if word.value.istitle():
ret += 1
return ret
class SeasonYear(Rule):
"""
If a season is a valid year and no year was found, create an match with year.
-2
View File
@@ -3,8 +3,6 @@
"""
audio_codec, audio_profile and audio_channels property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk, Rule, RemoveMatch
-2
View File
@@ -3,8 +3,6 @@
"""
bonus property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk, AppendMatch, Rule
-2
View File
@@ -3,8 +3,6 @@
"""
cd and cd_count properties
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk
-2
View File
@@ -3,8 +3,6 @@
"""
container property
"""
from __future__ import unicode_literals
from rebulk.remodule import re, REGEX_AVAILABLE
from rebulk import Rebulk
+3 -4
View File
@@ -4,8 +4,6 @@
country property
"""
# pylint: disable=no-member
from __future__ import unicode_literals
import babelfish
from rebulk import Rebulk
@@ -99,10 +97,11 @@ def find_countries(string, context=None):
"""
ret = []
for word_match in iter_words(string.strip().lower()):
word = word_match.value
try:
country_object = babelfish.Country.fromguessit(word_match.group())
country_object = babelfish.Country.fromguessit(word)
if is_valid_country(country_object, context):
ret.append((word_match.start(), word_match.end(), {'value': country_object}))
ret.append((word_match.span[0], word_match.span[1], {'value': country_object}))
except babelfish.Error:
continue
return ret
-2
View File
@@ -3,8 +3,6 @@
"""
crc and uuid properties
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk
-2
View File
@@ -3,8 +3,6 @@
"""
date and year properties
"""
from __future__ import unicode_literals
from rebulk import Rebulk, RemoveMatch, Rule
from ..common.date import search_date, valid_year
-2
View File
@@ -3,8 +3,6 @@
"""
edition property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk
@@ -3,8 +3,6 @@
"""
Episode title
"""
from __future__ import unicode_literals
from collections import defaultdict
from rebulk import Rebulk, Rule, AppendMatch, RenameMatch
-2
View File
@@ -3,8 +3,6 @@
"""
episode, season, episode_count, season_count and episode_details properties
"""
from __future__ import unicode_literals
import copy
from collections import defaultdict
-2
View File
@@ -3,8 +3,6 @@
"""
film property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk, AppendMatch, Rule
-2
View File
@@ -3,8 +3,6 @@
"""
format property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk, RemoveMatch, Rule
+2 -4
View File
@@ -4,8 +4,6 @@
language and subtitle_language properties
"""
# pylint: disable=no-member
from __future__ import unicode_literals
import copy
import babelfish
@@ -125,8 +123,8 @@ def find_languages(string, context=None):
matches = []
for word_match in iter_words(string):
word = word_match.group()
start, end = word_match.span()
word = word_match.value
start, end = word_match.span
lang_word = word.lower()
key = 'language'
-2
View File
@@ -3,8 +3,6 @@
"""
mimetype property
"""
from __future__ import unicode_literals
import mimetypes
from rebulk import Rebulk, CustomRule, POST_PROCESS
-2
View File
@@ -3,8 +3,6 @@
"""
other property
"""
from __future__ import unicode_literals
import copy
from rebulk.remodule import re
-2
View File
@@ -3,8 +3,6 @@
"""
part property
"""
from __future__ import unicode_literals
from rebulk.remodule import re, REGEX_AVAILABLE
from rebulk import Rebulk
@@ -3,8 +3,6 @@
"""
release_group property
"""
from __future__ import unicode_literals
import copy
from rebulk.remodule import re
-2
View File
@@ -3,8 +3,6 @@
"""
screen_size property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk, Rule, RemoveMatch
-2
View File
@@ -3,8 +3,6 @@
"""
title property
"""
from __future__ import unicode_literals
import re
from rebulk import Rebulk, Rule, AppendMatch, RemoveMatch, AppendTags
-2
View File
@@ -3,8 +3,6 @@
"""
type property
"""
from __future__ import unicode_literals
from rebulk import CustomRule, Rebulk, POST_PROCESS
from rebulk.match import Match
-2
View File
@@ -3,8 +3,6 @@
"""
video_codec and video_profile property
"""
from __future__ import unicode_literals
from rebulk.remodule import re
from rebulk import Rebulk, Rule, RemoveMatch
-2
View File
@@ -3,8 +3,6 @@
"""
Website property.
"""
from __future__ import unicode_literals
from pkg_resources import resource_stream # @UnresolvedImport
from rebulk.remodule import re, REGEX_AVAILABLE
+13 -9
View File
@@ -3,8 +3,7 @@
# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name
import os
import pytest
import six
from ..api import guessit, properties
@@ -12,20 +11,25 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
def test_default():
ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
ret = guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
assert ret and 'title' in ret
def test_forced_unicode():
ret = guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
assert ret and 'title' in ret and isinstance(ret['title'], six.text_type)
def test_forced_binary():
ret = guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
assert ret and 'title' in ret and isinstance(ret['title'], six.binary_type)
def test_unicode():
ret = guessit(u'[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi')
ret = guessit('[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi')
assert ret and 'title' in ret
def test_main_non_unicode():
with pytest.raises(TypeError):
guessit(b'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
def test_properties():
props = properties()
assert 'video_codec' in props.keys()
+4 -4
View File
@@ -9,19 +9,19 @@ from ..api import guessit
def case1():
return guessit(u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
return guessit('Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv')
def case2():
return guessit(u'Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv')
return guessit('Movies/Fantastic Mr Fox/Fantastic.Mr.Fox.2009.DVDRip.{x264+LC-AAC.5.1}{Fr-Eng}{Sub.Fr-Eng}-™.[sharethefiles.com].mkv')
def case3():
return guessit(u'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi')
return guessit('Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi')
def case4():
return guessit(u'Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv')
return guessit('Movies/The Doors (1991)/09.03.08.The.Doors.(1991).BDRip.720p.AC3.X264-HiS@SiLUHD-English.[sharethefiles.com].mkv')
@pytest.mark.benchmark(
+13 -13
View File
@@ -16,35 +16,35 @@ def test_main_no_args():
def test_main():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv'])
def test_main_unicode():
main([u'[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi'])
def test_main_non_unicode():
main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv'])
def test_main_unicode():
main(['[阿维达].Avida.2006.FRENCH.DVDRiP.XViD-PROD.avi'])
def test_main_forced_unicode():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv'])
def test_main_verbose():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--verbose'])
main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--verbose'])
def test_main_yaml():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--yaml'])
main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--yaml'])
def test_main_json():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--json'])
main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '--json'])
def test_main_show_property():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-P', 'title'])
main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-P', 'title'])
def test_main_advanced():
main([u'Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-a'])
main(['Fear.and.Loathing.in.Las.Vegas.FRENCH.ENGLISH.720p.HDDVD.DTS.x264-ESiR.mkv', '-a'])
def test_main_input():
+26 -13
View File
@@ -169,21 +169,34 @@ class TestYml(object):
for string, expected in data.items():
TestYml.set_default(expected, default)
if not isinstance(string, six.text_type):
string = six.text_type(string)
if not string_predicate or string_predicate(string): # pylint: disable=not-callable
entry = self.check(string, expected)
if entry.ok:
logger.debug(u'[' + filename + '] ' + six.text_type(entry))
elif entry.warning:
logger.warning(u'[' + filename + '] ' + six.text_type(entry))
elif entry.error:
logger.error(u'[' + filename + '] ' + six.text_type(entry))
for line in entry.details:
logger.error(u'[' + filename + '] ' + ' ' * 4 + line)
entries.append(entry)
entry = self.check_data(filename, string, expected)
entries.append(entry)
entries.assert_ok()
def check_data(self, filename, string, expected):
if six.PY2 and isinstance(string, six.text_type):
string = string.encode('utf-8')
converts = []
for k, v in expected.items():
if isinstance(v, six.text_type):
v = v.encode('utf-8')
converts.append((k, v))
for k, v in converts:
expected[k] = v
if not isinstance(string, str):
string = str(string)
if not string_predicate or string_predicate(string): # pylint: disable=not-callable
entry = self.check(string, expected)
if entry.ok:
logger.debug('[' + filename + '] ' + str(entry))
elif entry.warning:
logger.warning('[' + filename + '] ' + str(entry))
elif entry.error:
logger.error('[' + filename + '] ' + str(entry))
for line in entry.details:
logger.error('[' + filename + '] ' + ' ' * 4 + line)
return entry
def check(self, string, expected):
negates, global_, string = self.parse_token_options(string)