From 485ac7141005f3f431c0c458f013225d349e2ae7 Mon Sep 17 00:00:00 2001 From: Toilal Date: Sun, 25 Oct 2015 19:20:49 +0100 Subject: [PATCH] Add more season/episodeNumber support --- guessit/__main__.py | 4 +++ guessit/rules/common/formatters.py | 28 +++++++++------------ guessit/rules/processors.py | 28 +++++++++++++-------- guessit/rules/properties/episode_title.py | 5 ++-- guessit/rules/properties/episodes.py | 30 ++++++++++++++++------- guessit/rules/properties/other.py | 6 ++++- guessit/rules/properties/title.py | 30 +++++++++++++++++------ guessit/test/rules/episodes.yml | 10 ++++++++ guessit/test/series.yml | 6 +++++ guessit/test/test_yml.py | 10 +++++--- pytest.ini | 2 +- 11 files changed, 108 insertions(+), 51 deletions(-) diff --git a/guessit/__main__.py b/guessit/__main__.py index de9fdb8..fb96a66 100644 --- a/guessit/__main__.py +++ b/guessit/__main__.py @@ -5,6 +5,10 @@ Entry point module """ # pragma: no cover from __future__ import print_function + +from rebulk import debug +debug.DEBUG = True + from collections import OrderedDict import os import logging diff --git a/guessit/rules/common/formatters.py b/guessit/rules/common/formatters.py index b0b10d8..c0d92d2 100644 --- a/guessit/rules/common/formatters.py +++ b/guessit/rules/common/formatters.py @@ -6,6 +6,7 @@ Formatters from . import seps import regex as re +from rebulk.formatters import formatters _excluded_clean_chars = ',:;-/\\' clean_chars = "" @@ -38,6 +39,17 @@ def strip(input_string): return input_string.strip(seps) +def raw_cleanup(raw): + """ + Cleanup a raw value to perform raw comparison + :param raw: + :type raw: + :return: + :rtype: + """ + return formatters(cleanup, strip)(raw.lower()) + + def reorder_title(title, articles=('the',), separators=(',', ', ')): """ Reorder the title @@ -57,19 +69,3 @@ def reorder_title(title, articles=('the',), separators=(',', ', ')): if ltitle[-len(suffix):] == suffix: return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)] return title - - -def chain(*formatters): - """ - Chain formatter functions - :param functions: - :type functions: - :return: - :rtype: - """ - def formatters_chain(input_string): # pylint:disable=missing-docstring - for formatter in formatters: - input_string = formatter(input_string) - return input_string - - return formatters_chain diff --git a/guessit/rules/processors.py b/guessit/rules/processors.py index 048d322..5ff0f9f 100644 --- a/guessit/rules/processors.py +++ b/guessit/rules/processors.py @@ -10,7 +10,8 @@ from .common.comparators import marker_sorted def prefer_last_path(matches): """ - If multiple match are found, keep the one in the most valuable filepart. + If multiple match are found with same name, keep the one in the most valuable filepart. + Also keep others match with same value than those in mose valuable filepart. :param matches: :param context: @@ -18,17 +19,24 @@ def prefer_last_path(matches): """ filepart = marker_sorted(matches.markers.named('path'), matches)[0] for name in matches.names: - named_list = matches.named(name) - if len(named_list) > 1: + name_matches = matches.named(name) + if len(name_matches) > 1: keep_list = [] - for named in named_list: - marker = matches.markers.at_match(named, lambda marker: marker is filepart, 0) + keep_values = [] + for name_match in name_matches: + marker = matches.markers.at_match(name_match, lambda marker: marker is filepart, 0) if marker: - keep_list.append(named) + keep_list.append(name_match) + keep_values.append(name_match.value) + + for name_match in name_matches: + if name_match not in keep_list and name_match.value in keep_values: + keep_list.append(name_match) + if keep_list: - for named in named_list: - if named not in keep_list: - matches.remove(named) + for name_match in name_matches: + if name_match not in keep_list: + matches.remove(name_match) def enlarge_group_matches(matches): @@ -53,4 +61,4 @@ def enlarge_group_matches(matches): matches.append(match) -PROCESSORS = Rebulk().processor(prefer_last_path, enlarge_group_matches) +PROCESSORS = Rebulk().processor(enlarge_group_matches).post_processor(prefer_last_path) diff --git a/guessit/rules/properties/episode_title.py b/guessit/rules/properties/episode_title.py index 9d9cf6a..ff66f5e 100644 --- a/guessit/rules/properties/episode_title.py +++ b/guessit/rules/properties/episode_title.py @@ -4,8 +4,9 @@ Episode title """ from rebulk import Rebulk, AppendMatchRule +from rebulk.formatters import formatters -from ..common.formatters import cleanup, reorder_title, chain +from ..common.formatters import cleanup, reorder_title class EpisodeTitleFromPosition(AppendMatchRule): @@ -19,7 +20,7 @@ class EpisodeTitleFromPosition(AppendMatchRule): filename = matches.markers.named('path', -1) start, end = filename.span - holes = matches.holes(start, end + 1, formatter=chain(cleanup, reorder_title), + holes = matches.holes(start, end + 1, formatter=formatters(cleanup, reorder_title), predicate=lambda hole: hole.value) for hole in holes: diff --git a/guessit/rules/properties/episodes.py b/guessit/rules/properties/episodes.py index 42623af..cc3c257 100644 --- a/guessit/rules/properties/episodes.py +++ b/guessit/rules/properties/episodes.py @@ -9,13 +9,14 @@ from rebulk import Rebulk, RemoveMatchRule import regex as re from ..common.validators import seps_surround from guessit.rules.common import dash +from ..common.numeral import numeral, parse_numeral EPISODES = Rebulk().defaults(validate_all=True, validator={'__parent__': seps_surround}) EPISODES.regex_defaults(flags=re.IGNORECASE, children=True) -EPISODES.regex(r'(?P\d+)x(?P\d+)', - r'S(?P\d+)[ex](?P\d+)', - r'S(?P\d+)xe(?P\d+)', +EPISODES.regex(r'(?P\d+)x(?P\d+)', # 01x02 + r'S(?P\d+)[ex](?P\d+)', # S01E02, S01x02 + r'S(?P\d+)xe(?P\d+)', # S01Ex02 formatter=int, private_parent=True, tags=['SxxExx'], @@ -24,20 +25,31 @@ EPISODES.regex(r'(?P\d+)x(?P\d+)', and other.name == 'screenSize' else '__default__') +season_words = ['season', 'saison', 'serie', 'seasons', 'saisons', 'series'] +episode_words = ['episode', 'episodes'] + +EPISODES.regex(r'\L-(?P' + numeral + ')', season_words=season_words, # Season 1, # Season one + abbreviations=[dash], formatter=parse_numeral) + +season_markers = ['s'] +episode_markers = ['e', 'ep'] + + no_zero_validator = {'__parent__': seps_surround, 'season': lambda match: match.value > 0, 'episodeNumber': lambda match: match.value > 0} - -EPISODES.regex(r'(?P\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int) -EPISODES.regex(r'0(?P\d{1,2})', tags=['bonus-conflict', 'weak-movie'], formatter=int) -EPISODES.regex(r'(?P\d{3,4})', tags=['bonus-conflict', 'weak-movie'], formatter=int, +EPISODES.regex(r'(?P\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int) # 12 +EPISODES.regex(r'0(?P\d{1,2})', tags=['bonus-conflict', 'weak-movie'], formatter=int) # 02, 012 +EPISODES.regex(r'(?P\d{3,4})', tags=['bonus-conflict', 'weak-movie'], formatter=int, # 112, 113 validator=no_zero_validator, disabled=lambda context: not context.get('episode_prefer_number', False)) -EPISODES.regex(r'(?P\d{1})(?P\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int, +EPISODES.regex(r'(?P\d{1})(?P\d{2})', tags=['bonus-conflict', 'weak-movie'], # 102 + formatter=int, validator=no_zero_validator, disabled=lambda context: context.get('episode_prefer_number', False)) -EPISODES.regex(r'(?P\d{2})(?P\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int, +EPISODES.regex(r'(?P\d{2})(?P\d{2})', tags=['bonus-conflict', 'weak-movie'], # 0102 + formatter=int, validator=no_zero_validator, conflict_solver=lambda match, other: match if other.name == 'year' else '__default__', disabled=lambda context: context.get('episode_prefer_number', False)) diff --git a/guessit/rules/properties/other.py b/guessit/rules/properties/other.py index 0155f2e..c7c015c 100644 --- a/guessit/rules/properties/other.py +++ b/guessit/rules/properties/other.py @@ -11,6 +11,7 @@ import regex as re from ..common import dash from ..common import seps from ..common.validators import seps_surround +from guessit.rules.common.formatters import raw_cleanup OTHER = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True) OTHER.defaults(name="other", validator=seps_surround) @@ -88,9 +89,12 @@ def proper_count(matches): """ propers = matches.named('other', lambda match: match.value == 'Proper') if propers: + raws = {} # Count distinct raw values + for proper in propers: + raws[raw_cleanup(proper.raw)] = proper proper_count_match = copy.copy(propers[-1]) proper_count_match.name = 'properCount' - proper_count_match.value = len(propers) + proper_count_match.value = len(raws) matches.append(proper_count_match) diff --git a/guessit/rules/properties/title.py b/guessit/rules/properties/title.py index ca8e6b2..24ebd59 100644 --- a/guessit/rules/properties/title.py +++ b/guessit/rules/properties/title.py @@ -4,8 +4,9 @@ Title """ from rebulk import Rebulk, RemoveMatchRule, AppendRemoveMatchRule +from rebulk.formatters import formatters -from ..common.formatters import cleanup, reorder_title, chain +from ..common.formatters import cleanup, reorder_title from ..common.comparators import marker_sorted from ..common import seps from rebulk.rules import AppendRemoveMatchRule @@ -31,7 +32,7 @@ class TitleFromPosition(AppendRemoveMatchRule): """ start, end = filepart.span - first_hole = matches.holes(start, end + 1, formatter=chain(cleanup, reorder_title), + first_hole = matches.holes(start, end + 1, formatter=formatters(cleanup, reorder_title), ignore=TitleFromPosition.ignore_language, predicate=lambda hole: hole.value, index=0) @@ -127,20 +128,33 @@ class PreferTitleWithYear(RemoveMatchRule): priority = -255 def when(self, matches, context): - with_year = [] - without_year = [] + to_keep = [] + to_remove = [] for title in matches.named('title'): filepart = matches.markers.at_match(title, lambda marker: marker.name == 'path', 0) if filepart: year_match = matches.range(filepart.start, filepart.end, lambda match: match.name == 'year', 0) if year_match: - with_year.append(title) + to_keep.append(title) else: - without_year.append(title) + to_remove.append(title) - if with_year: - return without_year + if to_keep: + title_values = set([title.value for title in to_keep]) + if len(title_values) > 1: + # We have distinct values for title with year. Keep only values from most valuable filepart. + fileparts = marker_sorted(matches.markers.named('path'), matches) + best_title = None + for filepart in fileparts: + best_title = matches.range(filepart.start, filepart.end, lambda match: match.name == 'title', 0) + if best_title: + break + for title in to_keep: + if title.value != best_title.value: + to_remove.append(title) + to_keep.remove(title) + return to_remove TITLE = Rebulk().rules(TitleFromPosition, PreferTitleWithYear) diff --git a/guessit/test/rules/episodes.yml b/guessit/test/rules/episodes.yml index 80cf45f..d89c12b 100644 --- a/guessit/test/rules/episodes.yml +++ b/guessit/test/rules/episodes.yml @@ -25,3 +25,13 @@ ? "S03E04 102" : season: 3 episodeNumber: 4 + +? +serie Saison 2 other +? +serie Season 2 other +? +serie Saisons 2 other +? +serie Seasons 2 other +? +serie Serie 2 other +? +serie Series 2 other +? +serie Season Two other +? +serie Season II other +: season: 2 diff --git a/guessit/test/series.yml b/guessit/test/series.yml index 21ed839..6ecbc15 100644 --- a/guessit/test/series.yml +++ b/guessit/test/series.yml @@ -73,3 +73,9 @@ season: 4 episodeNumber: 1 episodeTitle: Fun Run + +? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi +: title: Mad Men + season: 1 + episodeNumber: 1 + other: Complete diff --git a/guessit/test/test_yml.py b/guessit/test/test_yml.py index 7cc2aa3..e938aa1 100644 --- a/guessit/test/test_yml.py +++ b/guessit/test/test_yml.py @@ -2,6 +2,8 @@ # -*- coding: utf-8 -*- # pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name import logging +logger = logging.getLogger(__name__) + from collections import OrderedDict import babelfish @@ -192,13 +194,13 @@ class TestYml(object): if not string_predicate or string_predicate(string): # pylint: disable=not-callable entry = self.check(string, expected) if entry.ok: - logging.debug(u'[' + filename + '] ' + six.text_type(entry)) + logger.debug(u'[' + filename + '] ' + six.text_type(entry)) elif entry.warning: - logging.warning(u'[' + filename + '] ' + six.text_type(entry)) + logger.warning(u'[' + filename + '] ' + six.text_type(entry)) elif entry.error: - logging.error(u'[' + filename + '] ' + six.text_type(entry)) + logger.error(u'[' + filename + '] ' + six.text_type(entry)) for line in entry.details: - logging.error(u'[' + filename + '] ' + ' ' * 4 + line) + logger.error(u'[' + filename + '] ' + ' ' * 4 + line) entries.append(entry) entries.assert_ok() diff --git a/pytest.ini b/pytest.ini index 6159c20..fc08a50 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = --ignore=setup.py --doctest-modules --doctest-glob='README.rst' +addopts =-s --ignore=setup.py --doctest-modules --doctest-glob='README.rst'