From 485ac7141005f3f431c0c458f013225d349e2ae7 Mon Sep 17 00:00:00 2001
From: Toilal <toilal.dev@gmail.com>
Date: Sun, 25 Oct 2015 19:20:49 +0100
Subject: [PATCH] Add more season/episodeNumber support

---
 guessit/__main__.py                       |  4 +++
 guessit/rules/common/formatters.py        | 28 +++++++++------------
 guessit/rules/processors.py               | 28 +++++++++++++--------
 guessit/rules/properties/episode_title.py |  5 ++--
 guessit/rules/properties/episodes.py      | 30 ++++++++++++++++-------
 guessit/rules/properties/other.py         |  6 ++++-
 guessit/rules/properties/title.py         | 30 +++++++++++++++++------
 guessit/test/rules/episodes.yml           | 10 ++++++++
 guessit/test/series.yml                   |  6 +++++
 guessit/test/test_yml.py                  | 10 +++++---
 pytest.ini                                |  2 +-
 11 files changed, 108 insertions(+), 51 deletions(-)

diff --git a/guessit/__main__.py b/guessit/__main__.py
index de9fdb8..fb96a66 100644
--- a/guessit/__main__.py
+++ b/guessit/__main__.py
@@ -5,6 +5,10 @@ Entry point module
 """
 # pragma: no cover
 from __future__ import print_function
+
+from rebulk import debug
+debug.DEBUG = True
+
 from collections import OrderedDict
 import os
 import logging
diff --git a/guessit/rules/common/formatters.py b/guessit/rules/common/formatters.py
index b0b10d8..c0d92d2 100644
--- a/guessit/rules/common/formatters.py
+++ b/guessit/rules/common/formatters.py
@@ -6,6 +6,7 @@ Formatters
 
 from . import seps
 import regex as re
+from rebulk.formatters import formatters
 
 _excluded_clean_chars = ',:;-/\\'
 clean_chars = ""
@@ -38,6 +39,17 @@ def strip(input_string):
     return input_string.strip(seps)
 
 
+def raw_cleanup(raw):
+    """
+    Normalize a raw value so that raw values can be compared with each other.
+    :param raw: raw string; it is lowercased, then passed through formatters(cleanup, strip)
+    :type raw: str (presumably; anything exposing ``lower()`` is accepted by the code)
+    :return: result of applying the cleanup and strip formatters to the lowercased input
+    :rtype: str (assuming the formatters are applied in order, strip returning last)
+    """
+    return formatters(cleanup, strip)(raw.lower())
+
+
 def reorder_title(title, articles=('the',), separators=(',', ', ')):
     """
     Reorder the title
@@ -57,19 +69,3 @@ def reorder_title(title, articles=('the',), separators=(',', ', ')):
             if ltitle[-len(suffix):] == suffix:
                 return title[-len(suffix) + len(separator):] + ' ' + title[:-len(suffix)]
     return title
-
-
-def chain(*formatters):
-    """
-    Chain formatter functions
-    :param functions:
-    :type functions:
-    :return:
-    :rtype:
-    """
-    def formatters_chain(input_string):  # pylint:disable=missing-docstring
-        for formatter in formatters:
-            input_string = formatter(input_string)
-        return input_string
-
-    return formatters_chain
diff --git a/guessit/rules/processors.py b/guessit/rules/processors.py
index 048d322..5ff0f9f 100644
--- a/guessit/rules/processors.py
+++ b/guessit/rules/processors.py
@@ -10,7 +10,8 @@ from .common.comparators import marker_sorted
 
 def prefer_last_path(matches):
     """
-    If multiple match are found, keep the one in the most valuable filepart.
+    If multiple matches are found with the same name, keep the ones in the most valuable filepart.
+    Also keep other matches that have the same value as those in the most valuable filepart.
 
     :param matches:
     :param context:
@@ -18,17 +19,24 @@ def prefer_last_path(matches):
     """
     filepart = marker_sorted(matches.markers.named('path'), matches)[0]
     for name in matches.names:
-        named_list = matches.named(name)
-        if len(named_list) > 1:
+        name_matches = matches.named(name)
+        if len(name_matches) > 1:
             keep_list = []
-            for named in named_list:
-                marker = matches.markers.at_match(named, lambda marker: marker is filepart, 0)
+            keep_values = []  # values carried by the matches found in the preferred filepart
+            for name_match in name_matches:
+                marker = matches.markers.at_match(name_match, lambda marker: marker is filepart, 0)
                 if marker:
-                    keep_list.append(named)
+                    keep_list.append(name_match)
+                    keep_values.append(name_match.value)
+
+            for name_match in name_matches:  # second pass: also keep matches elsewhere sharing a kept value
+                if name_match not in keep_list and name_match.value in keep_values:
+                    keep_list.append(name_match)
+
             if keep_list:
-                for named in named_list:
-                    if named not in keep_list:
-                        matches.remove(named)
+                for name_match in name_matches:
+                    if name_match not in keep_list:
+                        matches.remove(name_match)
 
 
 def enlarge_group_matches(matches):
@@ -53,4 +61,4 @@ def enlarge_group_matches(matches):
             matches.append(match)
 
 
-PROCESSORS = Rebulk().processor(prefer_last_path, enlarge_group_matches)
+PROCESSORS = Rebulk().processor(enlarge_group_matches).post_processor(prefer_last_path)
diff --git a/guessit/rules/properties/episode_title.py b/guessit/rules/properties/episode_title.py
index 9d9cf6a..ff66f5e 100644
--- a/guessit/rules/properties/episode_title.py
+++ b/guessit/rules/properties/episode_title.py
@@ -4,8 +4,9 @@
 Episode title
 """
 from rebulk import Rebulk, AppendMatchRule
+from rebulk.formatters import formatters
 
-from ..common.formatters import cleanup, reorder_title, chain
+from ..common.formatters import cleanup, reorder_title
 
 
 class EpisodeTitleFromPosition(AppendMatchRule):
@@ -19,7 +20,7 @@ class EpisodeTitleFromPosition(AppendMatchRule):
         filename = matches.markers.named('path', -1)
         start, end = filename.span
 
-        holes = matches.holes(start, end + 1, formatter=chain(cleanup, reorder_title),
+        holes = matches.holes(start, end + 1, formatter=formatters(cleanup, reorder_title),
                               predicate=lambda hole: hole.value)
 
         for hole in holes:
diff --git a/guessit/rules/properties/episodes.py b/guessit/rules/properties/episodes.py
index 42623af..cc3c257 100644
--- a/guessit/rules/properties/episodes.py
+++ b/guessit/rules/properties/episodes.py
@@ -9,13 +9,14 @@ from rebulk import Rebulk, RemoveMatchRule
 import regex as re
 from ..common.validators import seps_surround
 from guessit.rules.common import dash
+from ..common.numeral import numeral, parse_numeral
 
 EPISODES = Rebulk().defaults(validate_all=True, validator={'__parent__': seps_surround})
 EPISODES.regex_defaults(flags=re.IGNORECASE, children=True)
 
-EPISODES.regex(r'(?P<season>\d+)x(?P<episodeNumber>\d+)',
-               r'S(?P<season>\d+)[ex](?P<episodeNumber>\d+)',
-               r'S(?P<season>\d+)xe(?P<episodeNumber>\d+)',
+EPISODES.regex(r'(?P<season>\d+)x(?P<episodeNumber>\d+)',  # 01x02
+               r'S(?P<season>\d+)[ex](?P<episodeNumber>\d+)',  # S01E02, S01x02
+               r'S(?P<season>\d+)xe(?P<episodeNumber>\d+)',  # S01Ex02
                formatter=int,
                private_parent=True,
                tags=['SxxExx'],
@@ -24,20 +25,31 @@ EPISODES.regex(r'(?P<season>\d+)x(?P<episodeNumber>\d+)',
                and other.name == 'screenSize'
                else '__default__')
 
+season_words = ['season', 'saison', 'serie', 'seasons', 'saisons', 'series']  # words that introduce a season number
+episode_words = ['episode', 'episodes']  # NOTE(review): not referenced by any rule shown here - confirm usage
+
+EPISODES.regex(r'\L<season_words>-(?P<season>' + numeral + ')', season_words=season_words,  # Season 1, Season one
+               abbreviations=[dash], formatter=parse_numeral)
+
+season_markers = ['s']  # NOTE(review): not referenced by any rule shown here - confirm usage
+episode_markers = ['e', 'ep']  # NOTE(review): not referenced by any rule shown here - confirm usage
+
+
 no_zero_validator = {'__parent__': seps_surround,
                      'season': lambda match: match.value > 0, 'episodeNumber': lambda match: match.value > 0}
 
-
-EPISODES.regex(r'(?P<episodeNumber>\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int)
-EPISODES.regex(r'0(?P<episodeNumber>\d{1,2})', tags=['bonus-conflict', 'weak-movie'], formatter=int)
-EPISODES.regex(r'(?P<episodeNumber>\d{3,4})', tags=['bonus-conflict', 'weak-movie'], formatter=int,
+EPISODES.regex(r'(?P<episodeNumber>\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int)  # 12
+EPISODES.regex(r'0(?P<episodeNumber>\d{1,2})', tags=['bonus-conflict', 'weak-movie'], formatter=int)  # 02, 012
+EPISODES.regex(r'(?P<episodeNumber>\d{3,4})', tags=['bonus-conflict', 'weak-movie'], formatter=int,  # 112, 113
                validator=no_zero_validator,
                disabled=lambda context: not context.get('episode_prefer_number', False))
 
-EPISODES.regex(r'(?P<season>\d{1})(?P<episodeNumber>\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int,
+EPISODES.regex(r'(?P<season>\d{1})(?P<episodeNumber>\d{2})', tags=['bonus-conflict', 'weak-movie'],  # 102
+               formatter=int,
                validator=no_zero_validator,
                disabled=lambda context: context.get('episode_prefer_number', False))
-EPISODES.regex(r'(?P<season>\d{2})(?P<episodeNumber>\d{2})', tags=['bonus-conflict', 'weak-movie'], formatter=int,
+EPISODES.regex(r'(?P<season>\d{2})(?P<episodeNumber>\d{2})', tags=['bonus-conflict', 'weak-movie'],  # 0102
+               formatter=int,
                validator=no_zero_validator,
                conflict_solver=lambda match, other: match if other.name == 'year' else '__default__',
                disabled=lambda context: context.get('episode_prefer_number', False))
diff --git a/guessit/rules/properties/other.py b/guessit/rules/properties/other.py
index 0155f2e..c7c015c 100644
--- a/guessit/rules/properties/other.py
+++ b/guessit/rules/properties/other.py
@@ -11,6 +11,7 @@ import regex as re
 from ..common import dash
 from ..common import seps
 from ..common.validators import seps_surround
+from guessit.rules.common.formatters import raw_cleanup
 
 OTHER = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash]).string_defaults(ignore_case=True)
 OTHER.defaults(name="other", validator=seps_surround)
@@ -88,9 +89,12 @@ def proper_count(matches):
     """
     propers = matches.named('other', lambda match: match.value == 'Proper')
     if propers:
+        raws = {}  # Count distinct raw values
+        for proper in propers:
+            raws[raw_cleanup(proper.raw)] = proper
         proper_count_match = copy.copy(propers[-1])
         proper_count_match.name = 'properCount'
-        proper_count_match.value = len(propers)
+        proper_count_match.value = len(raws)
         matches.append(proper_count_match)
 
 
diff --git a/guessit/rules/properties/title.py b/guessit/rules/properties/title.py
index ca8e6b2..24ebd59 100644
--- a/guessit/rules/properties/title.py
+++ b/guessit/rules/properties/title.py
@@ -4,8 +4,9 @@
 Title
 """
 from rebulk import Rebulk, RemoveMatchRule, AppendRemoveMatchRule
+from rebulk.formatters import formatters
 
-from ..common.formatters import cleanup, reorder_title, chain
+from ..common.formatters import cleanup, reorder_title
 from ..common.comparators import marker_sorted
 from ..common import seps
 from rebulk.rules import AppendRemoveMatchRule
@@ -31,7 +32,7 @@ class TitleFromPosition(AppendRemoveMatchRule):
         """
         start, end = filepart.span
 
-        first_hole = matches.holes(start, end + 1, formatter=chain(cleanup, reorder_title),
+        first_hole = matches.holes(start, end + 1, formatter=formatters(cleanup, reorder_title),
                                    ignore=TitleFromPosition.ignore_language,
                                    predicate=lambda hole: hole.value, index=0)
 
@@ -127,20 +128,33 @@ class PreferTitleWithYear(RemoveMatchRule):
     priority = -255
 
     def when(self, matches, context):
-        with_year = []
-        without_year = []
+        to_keep = []  # titles located in a filepart that also contains a year match
+        to_remove = []  # titles returned to the caller for removal
 
         for title in matches.named('title'):
             filepart = matches.markers.at_match(title, lambda marker: marker.name == 'path', 0)
             if filepart:
                 year_match = matches.range(filepart.start, filepart.end, lambda match: match.name == 'year', 0)
                 if year_match:
-                    with_year.append(title)
+                    to_keep.append(title)
                 else:
-                    without_year.append(title)
+                    to_remove.append(title)
 
-        if with_year:
-            return without_year
+        if to_keep:
+            title_values = set([title.value for title in to_keep])
+            if len(title_values) > 1:
+                # We have distinct values for title with year. Keep only values from most valuable filepart.
+                fileparts = marker_sorted(matches.markers.named('path'), matches)
+                best_title = None
+                for filepart in fileparts:
+                    best_title = matches.range(filepart.start, filepart.end, lambda match: match.name == 'title', 0)
+                    if best_title:
+                        break
+                for title in list(to_keep):  # iterate over a snapshot: to_keep is mutated in the loop body
+                    if title.value != best_title.value:
+                        to_remove.append(title)
+                        to_keep.remove(title)
+            return to_remove
 
 
 TITLE = Rebulk().rules(TitleFromPosition, PreferTitleWithYear)
diff --git a/guessit/test/rules/episodes.yml b/guessit/test/rules/episodes.yml
index 80cf45f..d89c12b 100644
--- a/guessit/test/rules/episodes.yml
+++ b/guessit/test/rules/episodes.yml
@@ -25,3 +25,13 @@
 ? "S03E04 102"
 : season: 3
   episodeNumber: 4
+
+? +serie Saison 2 other
+? +serie Season 2 other
+? +serie Saisons 2 other
+? +serie Seasons 2 other
+? +serie Serie 2 other
+? +serie Series 2 other
+? +serie Season Two other
+? +serie Season II other
+: season: 2
diff --git a/guessit/test/series.yml b/guessit/test/series.yml
index 21ed839..6ecbc15 100644
--- a/guessit/test/series.yml
+++ b/guessit/test/series.yml
@@ -73,3 +73,9 @@
   season: 4
   episodeNumber: 1
   episodeTitle: Fun Run
+
+? Series/Mad Men Season 1 Complete/Mad.Men.S01E01.avi
+: title: Mad Men
+  season: 1
+  episodeNumber: 1
+  other: Complete
diff --git a/guessit/test/test_yml.py b/guessit/test/test_yml.py
index 7cc2aa3..e938aa1 100644
--- a/guessit/test/test_yml.py
+++ b/guessit/test/test_yml.py
@@ -2,6 +2,8 @@
 # -*- coding: utf-8 -*-
 # pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name
 import logging
+logger = logging.getLogger(__name__)
+
 from collections import OrderedDict
 
 import babelfish
@@ -192,13 +194,13 @@ class TestYml(object):
             if not string_predicate or string_predicate(string):  # pylint: disable=not-callable
                 entry = self.check(string, expected)
                 if entry.ok:
-                    logging.debug(u'[' + filename + '] ' + six.text_type(entry))
+                    logger.debug(u'[' + filename + '] ' + six.text_type(entry))
                 elif entry.warning:
-                    logging.warning(u'[' + filename + '] ' + six.text_type(entry))
+                    logger.warning(u'[' + filename + '] ' + six.text_type(entry))
                 elif entry.error:
-                    logging.error(u'[' + filename + '] ' + six.text_type(entry))
+                    logger.error(u'[' + filename + '] ' + six.text_type(entry))
                     for line in entry.details:
-                        logging.error(u'[' + filename + '] ' + ' ' * 4 + line)
+                        logger.error(u'[' + filename + '] ' + ' ' * 4 + line)
                 entries.append(entry)
         entries.assert_ok()
 
diff --git a/pytest.ini b/pytest.ini
index 6159c20..fc08a50 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,2 @@
 [pytest]
-addopts = --ignore=setup.py --doctest-modules --doctest-glob='README.rst'
+addopts =-s --ignore=setup.py --doctest-modules --doctest-glob='README.rst'