diff --git a/Python/libraries/datatypes-timex-expression/setup.py b/Python/libraries/datatypes-timex-expression/setup.py index 2e39a36a32..911747ecfa 100644 --- a/Python/libraries/datatypes-timex-expression/setup.py +++ b/Python/libraries/datatypes-timex-expression/setup.py @@ -11,7 +11,7 @@ def read(fname): NAME = 'datatypes_timex_expression_genesys' -VERSION = '1.1.9' +VERSION = '1.1.13a2' REQUIRES = [] setup( diff --git a/Python/libraries/recognizers-choice/setup.py b/Python/libraries/recognizers-choice/setup.py index b2cd09a8b4..b1bd8d8efc 100644 --- a/Python/libraries/recognizers-choice/setup.py +++ b/Python/libraries/recognizers-choice/setup.py @@ -11,7 +11,7 @@ def read(fname): NAME = 'recognizers-text-choice-genesys' -VERSION = '1.1.9' +VERSION = '1.1.13a2' REQUIRES = ['recognizers-text-genesys', 'regex', 'grapheme'] setup( diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_configs.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_configs.py index 45ca9fe97b..b02e1ac5e3 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_configs.py +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_configs.py @@ -8,7 +8,7 @@ from .utilities import DateTimeUtilityConfiguration -class MinimalBaseDateParserConfiguration(ABC): +class BaseMinimalDateParserConfiguration(ABC): @property @abstractmethod def cardinal_extractor(self) -> BaseNumberExtractor: diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_merged.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_merged.py index 9b9bdaad27..8c5c773cce 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_merged.py +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/base_minimal_merged.py @@ -1,22 +1,16 @@ from abc 
import abstractmethod, ABC -from typing import List, Optional, Pattern, Dict +from typing import List, Pattern from datetime import datetime -from collections import namedtuple -import regex from recognizers_text.extractor import Extractor, ExtractResult -from .constants import Constants, TimeTypeConstants from .extractors import DateTimeExtractor -from .parsers import DateTimeParser, DateTimeParseResult from .base_date import BaseDateParser from .base_time import BaseTimeParser -from .utilities import Token, merge_all_tokens, DateTimeOptions, DateTimeFormatUtil, DateUtils, RegExpUtility, TimexUtil - -MatchedIndex = namedtuple('MatchedIndex', ['matched', 'index']) +from .utilities import DateTimeOptions +from recognizers_date_time.date_time.base_merged import BaseMergedExtractor, BaseMergedParser class MinimalMergedExtractorConfiguration: - @property @abstractmethod def ambiguous_range_modifier_prefix(self) -> Pattern: @@ -58,14 +52,10 @@ def check_both_before_after(self): raise NotImplementedError -class MinimalMergedExtractor(DateTimeExtractor): - @property - def extractor_type_name(self) -> str: - return Constants.SYS_DATETIME_MERGED +class MinimalMergedExtractor(BaseMergedExtractor): def __init__(self, config: MinimalMergedExtractorConfiguration, options: DateTimeOptions): - self.config = config - self.options = options + super().__init__(config, options) def extract(self, source: str, reference: datetime = None) -> List[ExtractResult]: if reference is None: @@ -89,58 +79,6 @@ def extract(self, source: str, reference: datetime = None) -> List[ExtractResult return result - def add_to(self, destinations: List[ExtractResult], source: List[ExtractResult], text: str) -> List[ExtractResult]: - for value in source: - is_found = False - overlap_indexes: List[int] = list() - first_index = -1 - - for index, destination in enumerate(destinations): - if destination.overlap(value): - is_found = True - if destination.cover(value): - if first_index == -1: - first_index = 
index - overlap_indexes.append(index) - else: - continue - - if not is_found: - destinations.append(value) - elif overlap_indexes: - temp_dst: List[ExtractResult] = list() - - for index, destination in enumerate(destinations): - if index not in overlap_indexes: - temp_dst.append(destination) - - # insert at the first overlap occurence to keep the order - temp_dst.insert(first_index, value) - destinations = temp_dst - return destinations - - def number_ending_regex_match(self, source: str, extract_results: List[ExtractResult]) -> List[ExtractResult]: - tokens: List[Token] = list() - - for extract_result in extract_results: - if extract_result.type in [Constants.SYS_DATETIME_TIME, Constants.SYS_DATETIME_DATETIME]: - after_str = source[extract_result.start + - extract_result.length:] - match = regex.search( - self.config.number_ending_pattern, after_str) - if match: - new_time = RegExpUtility.get_group(match, Constants.NEW_TIME) - num_res = self.config.integer_extractor.extract(new_time) - if not num_res: - continue - - start_position = extract_result.start + \ - extract_result.length + match.group().index(new_time) - tokens.append( - Token(start_position, start_position + len(new_time))) - - return merge_all_tokens(tokens, source, Constants.SYS_DATETIME_TIME) - class MinimalMergedParserConfiguration(ABC): @property @@ -194,397 +132,7 @@ def date_time_parser(self) -> BaseTimeParser: raise NotImplementedError -class MinimalMergedParser(DateTimeParser): - @property - def parser_type_name(self) -> str: - return Constants.SYS_DATETIME_MERGED +class MinimalMergedParser(BaseMergedParser): def __init__(self, config: MinimalMergedParserConfiguration, options: DateTimeOptions): - self.__date_min_value = DateTimeFormatUtil.format_date( - DateUtils.min_value) - self.__date_time_min_value = DateTimeFormatUtil.format_date_time( - DateUtils.min_value) - self.config = config - self.options = options - - def parse(self, source: ExtractResult, reference: datetime = None) -> 
Optional[DateTimeParseResult]: - if not reference: - reference = datetime.now() - - # Push, save the MOD string - has_before = False - has_after = False - has_since = False - has_around = False - has_equal = False - has_date_after = False - match_is_after = False - - # "inclusive_mod" means MOD should include the start/end time - # For example, cases like "on or later than", "earlier than or in" have inclusive modifier - has_inclusive_mod = False - mod_str = '' - - result = self.parse_result(source, reference) - if not result: - return None - - # Pop, restore the MOD string - if has_before and result.value: - result.length += len(mod_str) - result.start -= 0 if match_is_after else len(mod_str) - result.text = result.text + mod_str if match_is_after else mod_str + result.text - val = result.value - - val.mod = self.combine_mod(val.mod, TimeTypeConstants.BEFORE_MOD if not has_inclusive_mod else - TimeTypeConstants.UNTIL_MOD) - if has_around: - val.mod = self.combine_mod(TimeTypeConstants.APPROX_MOD, val.mod) - has_around = False - result.value = val - - if has_after and result.value: - result.length += len(mod_str) - result.start -= len(mod_str) - result.text = mod_str + result.text - val = result.value - - val.mod = self.combine_mod(val.mod, TimeTypeConstants.AFTER_MOD if not has_inclusive_mod else - TimeTypeConstants.SINCE_MOD) - if has_around: - val.mod = self.combine_mod(TimeTypeConstants.APPROX_MOD, val.mod) - has_around = False - result.value = val - - if has_since and result.value: - result.length += len(mod_str) - result.start -= len(mod_str) - result.text = mod_str + result.text - val = result.value - val.mod = TimeTypeConstants.SINCE_MOD - if has_around: - val.mod = self.combine_mod(TimeTypeConstants.APPROX_MOD, val.mod) - has_around = False - result.value = val - - if has_around and result.value: - result.length += len(mod_str) - result.start -= len(mod_str) - result.text = mod_str + result.text - val = result.value - val.mod = TimeTypeConstants.APPROX_MOD 
- result.value = val - - if has_equal and result.value: - result.length += len(mod_str) - result.start -= len(mod_str) - result.text = mod_str + result.text - - if has_date_after and result.value: - result.length += len(mod_str) - result.text = result.text + mod_str - val = result.value - val.mod = self.combine_mod(val.mod, TimeTypeConstants.SINCE_MOD) - result.value = val - has_since = True - - # For cases like "3 pm or later on monday" - match = self.config.suffix_after.match(result.text) - if result.value and (match.start() != 0 if match else match) and \ - result.type == Constants.SYS_DATETIME_DATETIME: - val = result.value - val.mod = self.combine_mod(val.mod, TimeTypeConstants.SINCE_MOD) - result.value = val - has_since = True - - if self.options & DateTimeOptions.SPLIT_DATE_AND_TIME and result.value and result.value.sub_date_time_entities: - result.value = self._date_time_resolution_for_split(result) - else: - result = self.set_parse_result( - result, has_before, has_after, has_since) - - return result - - def parse_result(self, source: ExtractResult, reference: datetime): - if source.type == Constants.SYS_DATETIME_DATE: - result = self.config.date_parser.parse(source, reference) - elif source.type == Constants.SYS_DATETIME_TIME: - result = self.config.time_parser.parse(source, reference) - elif source.type == Constants.SYS_DATETIME_DATETIME: - result = self.config.date_time_parser.parse(source, reference) - else: - return None - - return result - - @staticmethod - def combine_mod(original_mod: str, new_mod: str): - combined_mod = new_mod - - if original_mod: - combined_mod = f"{new_mod}-{original_mod}" - - return combined_mod - - def set_parse_result(self, slot: DateTimeParseResult, has_before: bool, has_after: bool, has_since: bool)\ - -> DateTimeParseResult: - slot.value = self._date_time_resolution( - slot, has_before, has_after, has_since) - slot.type = f'{self.parser_type_name}.' 
\ - f'{self._determine_date_time_types(slot.type, has_before, has_after, has_since)}' - return slot - - def _get_parse_result(self, extractor_result: Extractor, reference: datetime) -> DateTimeParseResult: - extractor_type = extractor_result.type - if extractor_type == Constants.SYS_DATETIME_DATE: - result = self.config.date_parser.parse(extractor_result, reference) - return result - elif extractor_type == Constants.SYS_DATETIME_TIME: - return self.config.time_parser.parse(extractor_result, reference) - else: - return None - - def _determine_date_time_types(self, dtype: str, has_before: bool, has_after: bool, has_since: bool) -> str: - if self.options & DateTimeOptions.SPLIT_DATE_AND_TIME: - if dtype == Constants.SYS_DATETIME_DATETIME: - return Constants.SYS_DATETIME_TIME - else: - if has_before or has_after or has_since: - if dtype == Constants.SYS_DATETIME_DATE: - return Constants.SYS_DATETIME_DATEPERIOD - - if dtype == Constants.SYS_DATETIME_TIME: - return Constants.SYS_DATETIME_TIMEPERIOD - - if dtype == Constants.SYS_DATETIME_DATETIME: - return Constants.SYS_DATETIME_DATETIMEPERIOD - return dtype - - def _determine_source_entity_type(self, source_type: str, new_type: str, has_mod: bool) -> Optional[str]: - if not has_mod: - return None - - if new_type != source_type: - return Constants.SYS_DATETIME_DATETIMEPOINT - - if new_type == Constants.SYS_DATETIME_DATEPERIOD: - return Constants.SYS_DATETIME_DATETIMEPERIOD - - def _date_time_resolution_for_split(self, slot: DateTimeParseResult) -> List[DateTimeParseResult]: - results = [] - if slot.value.sub_date_time_entities: - sub_entities = slot.value.sub_date_time_entities - - for sub_entity in sub_entities: - result = sub_entity - result.start += slot.start - results += self._date_time_resolution_for_split(result) - else: - slot.value = self._date_time_resolution(slot, False, False, False) - slot.type = f'{self.parser_type_name}.{self._determine_date_time_types(slot.type, False, False, False)}' - 
results.append(slot) - - return results - - def _date_time_resolution(self, slot: DateTimeParseResult, has_before, has_after, has_since) ->\ - List[Dict[str, str]]: - if not slot: - return None - - result = dict() - resolutions = [] - - dtype = slot.type - output_type = self._determine_date_time_types(dtype, has_before, has_after, has_since) - source_entity = self._determine_source_entity_type(dtype, output_type, has_before or has_after or has_since) - - timex = slot.timex_str - - value = slot.value - if not value: - return None - - is_lunar = value.is_lunar - mod = value.mod - comment = value.comment - - self._add_resolution_fields_any(result, Constants.TIMEX_KEY, timex) - self._add_resolution_fields_any(result, Constants.COMMENT_KEY, comment) - self._add_resolution_fields_any(result, Constants.MOD_KEY, mod) - self._add_resolution_fields_any(result, Constants.TYPE_KEY, output_type) - self._add_resolution_fields_any( - result, Constants.IS_LUNAR_KEY, str(is_lunar).lower() if is_lunar else '') - - future_resolution = value.future_resolution - past_resolution = value.past_resolution - - future = self._generate_from_resolution(dtype, future_resolution, mod) - past = self._generate_from_resolution(dtype, past_resolution, mod) - - future_values = sorted(future.values()) - past_values = sorted(past.values()) - intersect_values = [i for i, j in zip( - future_values, past_values) if i == j] - - if len(intersect_values) == len(past_values) and len(intersect_values) == len(future_values): - if past_values: - self._add_resolution_fields_any( - result, Constants.RESOLVE_KEY, past) - else: - if past_values: - self._add_resolution_fields_any( - result, Constants.RESOLVE_TO_PAST_KEY, past) - if future_values: - self._add_resolution_fields_any( - result, Constants.RESOLVE_TO_FUTURE_KEY, future) - - if comment == Constants.AM_PM_GROUP_NAME: - if Constants.RESOLVE_KEY in result: - self._resolve_ampm(result, Constants.RESOLVE_KEY) - else: - self._resolve_ampm(result, 
Constants.RESOLVE_TO_PAST_KEY) - self._resolve_ampm(result, Constants.RESOLVE_TO_FUTURE_KEY) - - if TimexUtil._has_double_timex(comment): - TimexUtil._process_double_timex(result, Constants.RESOLVE_TO_FUTURE_KEY, Constants.RESOLVE_TO_PAST_KEY, timex) - - for value in result.values(): - if isinstance(value, dict): - new_values = {} - self._add_resolution_fields( - new_values, Constants.TIMEX_KEY, timex) - self._add_resolution_fields(new_values, Constants.MOD_KEY, mod) - - self._add_resolution_fields(new_values, Constants.TYPE_KEY, output_type) - self._add_resolution_fields(new_values, Constants.IS_LUNAR_KEY, - str(is_lunar).lower() if is_lunar else '') - self._add_resolution_fields(new_values, Constants.SOURCE_TYPE, source_entity) - - for inner_key in value: - new_values[inner_key] = value[inner_key] - - resolutions.append(new_values) - - if not past and not future: - o = {} - o['timex'] = timex - o['type'] = output_type - o['value'] = 'not resolved' - resolutions.append(o) - - return {'values': resolutions} - - def _add_resolution_fields_any(self, dic: Dict[str, str], key: str, value: object): - if isinstance(value, str): - if value: - dic[key] = value - else: - dic[key] = value - - def _add_resolution_fields(self, dic: [str, str], key: str, value: str): - if value: - dic[key] = value - - def _generate_from_resolution(self, dtype: str, resolution: Dict[str, str], mod: str) -> Dict[str, str]: - result = {} - - if dtype == Constants.SYS_DATETIME_DATETIME: - self.__add_single_date_time_to_resolution( - resolution, TimeTypeConstants.DATETIME, mod, result) - elif dtype == Constants.SYS_DATETIME_TIME: - self.__add_single_date_time_to_resolution( - resolution, TimeTypeConstants.TIME, mod, result) - elif dtype == Constants.SYS_DATETIME_DATE: - self.__add_single_date_time_to_resolution( - resolution, TimeTypeConstants.DATE, mod, result) - - return result - - def __add_single_date_time_to_resolution(self, resolutions: Dict[str, str], dtype: str, - mod: str, result: Dict[str, 
str]): - key = TimeTypeConstants.VALUE - value = resolutions[dtype] - if not value or value.startswith(self.__date_min_value): - return - - if mod: - if mod.startswith(TimeTypeConstants.BEFORE_MOD): - key = TimeTypeConstants.END - elif mod.startswith(TimeTypeConstants.AFTER_MOD): - key = TimeTypeConstants.START - elif mod.startswith(TimeTypeConstants.SINCE_MOD): - key = TimeTypeConstants.START - elif mod.startswith(TimeTypeConstants.UNTIL_MOD): - key = TimeTypeConstants.END - - result[key] = value - - def __add_period_to_resolution(self, resolutions: Dict[str, str], start_type: str, - end_type: str, mod: str, result: Dict[str, str]): - start = resolutions.get(start_type, None) - end = resolutions.get(end_type, None) - if mod: - if mod.startswith(TimeTypeConstants.BEFORE_MOD): - if mod.endswith(TimeTypeConstants.LATE_MOD): - result[TimeTypeConstants.END] = end - else: - result[TimeTypeConstants.END] = start - return - if mod.startswith(TimeTypeConstants.AFTER_MOD): - if mod.endswith(TimeTypeConstants.EARLY_MOD): - result[TimeTypeConstants.START] = start - else: - result[TimeTypeConstants.START] = end - return - if mod == TimeTypeConstants.SINCE_MOD: - result[TimeTypeConstants.START] = start - return - - if not (start and end): - return - - if start.startswith(Constants.INVALID_DATE_STRING) or end.startswith(Constants.INVALID_DATE_STRING): - return - - result[TimeTypeConstants.START] = start - result[TimeTypeConstants.END] = end - - def _resolve_ampm(self, values_map: Dict[str, str], key_name: str): - if key_name not in values_map: - return - resolution = values_map[key_name] - if Constants.TIMEX_KEY not in values_map: - return - timex = values_map[Constants.TIMEX_KEY] - values_map.pop(key_name, None) - values_map[key_name + Constants.AM_GROUP_NAME] = resolution - - resolution_pm = {} - if values_map[Constants.TYPE_KEY] == Constants.SYS_DATETIME_TIME: - resolution_pm[TimeTypeConstants.VALUE] = DateTimeFormatUtil.to_pm( - resolution[TimeTypeConstants.VALUE]) - 
resolution_pm[Constants.TIMEX_KEY] = DateTimeFormatUtil.to_pm(timex) - elif values_map[Constants.TYPE_KEY] == Constants.SYS_DATETIME_DATETIME: - split_value = resolution[TimeTypeConstants.VALUE].split(' ') - resolution_pm[ - TimeTypeConstants.VALUE] = f'{split_value[0]} {DateTimeFormatUtil.to_pm(split_value[1])}' - resolution_pm[Constants.TIMEX_KEY] = DateTimeFormatUtil.all_str_to_pm(timex) - elif values_map[Constants.TYPE_KEY] == Constants.SYS_DATETIME_TIMEPERIOD: - if TimeTypeConstants.START in resolution: - resolution_pm[TimeTypeConstants.START] = DateTimeFormatUtil.to_pm( - resolution[TimeTypeConstants.START]) - if TimeTypeConstants.END in resolution: - resolution_pm[TimeTypeConstants.END] = DateTimeFormatUtil.to_pm( - resolution[TimeTypeConstants.END]) - resolution_pm[Constants.TIMEX_KEY] = DateTimeFormatUtil.all_str_to_pm(timex) - elif values_map[Constants.TYPE_KEY] == Constants.SYS_DATETIME_DATETIMEPERIOD: - if TimeTypeConstants.START in resolution: - split_value = resolution[TimeTypeConstants.START].split(' ') - resolution_pm[ - TimeTypeConstants.START] = f'{split_value[0]} {DateTimeFormatUtil.to_pm(split_value[1])}' - if TimeTypeConstants.END in resolution: - split_value = resolution[TimeTypeConstants.END].split(' ') - resolution_pm[ - TimeTypeConstants.END] = f'{split_value[0]} {DateTimeFormatUtil.to_pm(split_value[1])}' - resolution_pm[Constants.TIMEX_KEY] = DateTimeFormatUtil.all_str_to_pm(timex) - values_map[key_name + Constants.PM_GROUP_NAME] = resolution_pm \ No newline at end of file + super().__init__(config, options) diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/catalan/common_configs.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/catalan/common_configs.py index 8b0d9bcaf1..adbbc234d2 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/catalan/common_configs.py +++ 
b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/catalan/common_configs.py @@ -11,7 +11,7 @@ from ..extractors import DateTimeExtractor from ..parsers import DateTimeParser from ..base_configs import DateTimeUtilityConfiguration -from ..base_minimal_configs import MinimalBaseDateParserConfiguration +from ..base_minimal_configs import BaseMinimalDateParserConfiguration from ..base_date import BaseDateExtractor, DateExtractorConfiguration, BaseDateParser from ..base_time import BaseTimeExtractor, BaseTimeParser from ..base_timezone import BaseTimeZoneParser @@ -42,7 +42,7 @@ def extract(self, source: str, reference: datetime = None) -> List[ExtractResult return result -class CatalanCommonDateTimeParserConfiguration(MinimalBaseDateParserConfiguration): +class CatalanCommonDateTimeParserConfiguration(BaseMinimalDateParserConfiguration): @property def time_zone_parser(self) -> DateTimeParser: return self._time_zone_parser @@ -120,7 +120,7 @@ def utility_configuration(self) -> DateTimeUtilityConfiguration: return self._utility_configuration def __init__(self): - MinimalBaseDateParserConfiguration.__init__(self) + BaseMinimalDateParserConfiguration.__init__(self) self._utility_configuration = CatalanDateTimeUtilityConfiguration() self._time_zone_parser = BaseTimeZoneParser() diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/date_time_recognizer.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/date_time_recognizer.py index 1b7298eb99..280cfb16a1 100644 --- a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/date_time_recognizer.py +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/date_time_recognizer.py @@ -1,6 +1,3 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
- from datetime import datetime from typing import List from recognizers_text import Culture, Recognizer @@ -9,6 +6,7 @@ from .models import DateTimeModel from .base_merged import BaseMergedExtractor, BaseMergedParser from .base_minimal_merged import MinimalMergedExtractor, MinimalMergedParser +from .minimal.base_minimal_merged import BaseMinimalMergedExtractor, BaseMinimalMergedParser from .english.common_configs import EnglishCommonDateTimeParserConfiguration from .english.merged_extractor_config import EnglishMergedExtractorConfiguration from .english.merged_parser_config import EnglishMergedParserConfiguration @@ -42,6 +40,8 @@ from .catalan.common_configs import CatalanCommonDateTimeParserConfiguration from .catalan.merged_extractor_config import CatalanMergedExtractorConfiguration from .catalan.merged_parser_config import CatalanMergedParserConfiguration +from .minimal.merged_extractor_config import BaseMinimalMergedExtractorConfiguration +from .minimal.merged_parser_config import BaseMinimalMergedParserConfiguration class DateTimeRecognizer(Recognizer[DateTimeOptions]): @@ -131,6 +131,15 @@ def initialize_configuration(self): MinimalMergedExtractor(CatalanMergedExtractorConfiguration(), options) )) + self.register_model('DateTimeModel', Culture.Minimal, lambda options: DateTimeModel( + BaseMinimalMergedParser(BaseMinimalMergedParserConfiguration(), options), + BaseMinimalMergedExtractor(BaseMinimalMergedExtractorConfiguration(), options) + )) + self.register_model('DateTimeModel', Culture.MinimalOther, lambda options: DateTimeModel( + BaseMinimalMergedParser(BaseMinimalMergedParserConfiguration(dmyDateFormat=False), options), + BaseMinimalMergedExtractor(BaseMinimalMergedExtractorConfiguration(), options) + )) + def get_datetime_model(self, culture: str = None, fallback_to_default_culture: bool = True) -> Model: return self.get_model('DateTimeModel', culture, fallback_to_default_culture) diff --git 
class MinimalDateTimeUtilityConfiguration:
    """Utility configuration for the minimal date-time culture.

    Holds the AM / PM descriptor patterns compiled from the
    ``MinimalDateTime`` resource definitions, plus the
    ``CheckBothBeforeAfter`` setting taken from the same resource.
    """

    @property
    def am_desc_regex(self) -> Pattern:
        """Pattern compiled from ``MinimalDateTime.AmDescRegex``."""
        return self._am_desc_regex

    @property
    def pm_desc__regex(self) -> Pattern:
        """Pattern compiled from ``MinimalDateTime.PmDescRegex``."""
        # NOTE: the double underscore is part of the existing public name and
        # is kept so that current callers keep working.
        return self._pm_desc__regex

    @property
    def am_pm_desc_regex(self) -> Pattern:
        """Pattern compiled from ``MinimalDateTime.AmPmDescRegex``."""
        return self._am_pm_desc_regex

    @property
    def check_both_before_after(self):
        """Value of ``MinimalDateTime.CheckBothBeforeAfter``.

        The value was already stored by ``__init__`` but had no accessor.
        """
        # NOTE(review): presumably a boolean flag - confirm against the
        # MinimalDateTime resource definition.
        return self._check_both_before_after

    def __init__(self):
        self._am_desc_regex = RegExpUtility.get_safe_reg_exp(
            MinimalDateTime.AmDescRegex)
        self._pm_desc__regex = RegExpUtility.get_safe_reg_exp(
            MinimalDateTime.PmDescRegex)
        self._am_pm_desc_regex = RegExpUtility.get_safe_reg_exp(
            MinimalDateTime.AmPmDescRegex)
        self._check_both_before_after = MinimalDateTime.CheckBothBeforeAfter
class BaseMinimalDateExtractor(BaseDateExtractor):
    """Date extractor that only uses the basic regex matching step."""

    def __init__(self, config):
        super().__init__(config)

    def extract(self, source: str, reference: datetime = None) -> List[ExtractResult]:
        """Extract date mentions from ``source``.

        Only ``basic_regex_match`` is consulted; ``reference`` is accepted for
        interface compatibility and is not used by this implementation.

        :param source: text to scan.
        :param reference: unused.
        :return: merged extraction results typed with ``extractor_type_name``.
        """
        # Function-scope import kept from the original
        # (presumably avoids a circular import - confirm).
        from ..utilities import merge_all_tokens

        tokens = list(self.basic_regex_match(source))
        return merge_all_tokens(tokens, source, self.extractor_type_name)


class BaseMinimalDateParser(BaseDateParser):
    """Date parser limited to basic regex matches and single numbers."""

    def __init__(self, config):
        super().__init__(config)

    def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Resolve an extracted date into a ``DateTimeParseResult``.

        :param source: extraction result to resolve.
        :param reference: reference date; defaults to ``datetime.now()``.
        :return: a parse result whose ``value`` is ``None`` and whose
            ``timex_str`` is empty when ``source`` is not of this parser's
            type or neither parsing step succeeds.
        """
        # Function-scope import kept from the original
        # (presumably avoids a circular import - confirm).
        from ..utilities import DateTimeFormatUtil

        if reference is None:
            reference = datetime.now()

        result_value = None

        # Compare by value: `is` tests object identity, which only holds for
        # equal strings when they happen to be the very same object.
        if source.type == self.parser_type_name:
            source_text = source.text.lower()
            inner_result = self.parse_basic_regex_match(source_text, reference)

            if not inner_result.success:
                inner_result = self.parse_single_number(source_text, reference)

            if inner_result.success:
                inner_result.future_resolution = {
                    TimeTypeConstants.DATE: DateTimeFormatUtil.format_date(
                        inner_result.future_value)
                }
                inner_result.past_resolution = {
                    TimeTypeConstants.DATE: DateTimeFormatUtil.format_date(
                        inner_result.past_value)
                }
                result_value = inner_result

        result = DateTimeParseResult(source)
        result.value = result_value
        result.timex_str = result_value.timex if result_value is not None else ''
        result.resolution_str = ''

        return result
# --- definitions from minimal/base_minimal_merged.py ---

class BaseMinimalMergedExtractor(BaseMergedExtractor):
    """Merged extractor that combines only date and time extractions."""

    @property
    def extractor_type_name(self) -> str:
        return Constants.SYS_DATETIME_MERGED

    def __init__(self, config, options: DateTimeOptions):
        super().__init__(config, options)

    def extract(self, source: str, reference: datetime = None) -> List[ExtractResult]:
        """Run the date then the time extractor and merge their results.

        :param source: text to scan.
        :param reference: reference date; defaults to ``datetime.now()``.
        :return: merged results ordered by start position.
        """
        if reference is None:
            reference = datetime.now()

        merged: List[ExtractResult] = list()

        # The order is important, since there can be conflicts in merging
        for extractor_name in ('date_extractor', 'time_extractor'):
            extractor = getattr(self.config, extractor_name)
            found = extractor.extract(source, reference)
            merged = self.add_to(merged, found, source)

        return sorted(merged, key=lambda item: item.start)


class BaseMinimalMergedParser(BaseMergedParser):
    """Merged parser that dispatches only to the date and time parsers."""

    @property
    def parser_type_name(self) -> str:
        return Constants.SYS_DATETIME_MERGED

    def __init__(self, config, options: DateTimeOptions):
        super().__init__(config, options)

    def parse_result(self, source: ExtractResult, reference: datetime):
        """Parse ``source`` with the parser matching its type.

        Returns ``None`` for any type other than date or time.
        """
        entity_type = source.type
        if entity_type == Constants.SYS_DATETIME_DATE:
            return self.config.date_parser.parse(source, reference)
        if entity_type == Constants.SYS_DATETIME_TIME:
            return self.config.time_parser.parse(source, reference)
        return None

    def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Parse ``source`` and attach its resolution, without modifier handling.

        :param source: extraction result to resolve.
        :param reference: reference date; defaults to ``datetime.now()``.
        :return: the resolved parse result, or ``None`` when nothing was parsed.
        """
        reference = reference if reference else datetime.now()

        parsed = self.parse_result(source, reference)
        if not parsed:
            return None

        split_requested = self.options & DateTimeOptions.SPLIT_DATE_AND_TIME
        if split_requested and parsed.value and parsed.value.sub_date_time_entities:
            parsed.value = self._date_time_resolution_for_split(parsed)
            return parsed

        # No before/after/since modifiers are tracked by this parser.
        return self.set_parse_result(
            parsed, has_before=False, has_after=False, has_since=False)


# --- definitions from minimal/base_minimal_time.py ---

class BaseMinimalTimeExtractor(BaseTimeExtractor):
    """Time extractor that only uses the basic regex matching step."""

    @property
    def extractor_type_name(self) -> str:
        return Constants.SYS_DATETIME_TIME

    def __init__(self, config):
        super().__init__(config)

    def extract(self, source: str, reference: datetime = None) -> List[ExtractResult]:
        """Extract time mentions from ``source``.

        Time-zone results are merged in only when the configuration's options
        include ``DateTimeOptions.ENABLE_PREVIEW``.
        """
        if reference is None:
            reference = datetime.now()

        matches = merge_all_tokens(
            self.basic_regex_match(source), source, self.extractor_type_name)

        preview_enabled = (self.config.options & DateTimeOptions.ENABLE_PREVIEW) != 0
        if not preview_enabled:
            return matches

        return TimeZoneUtility().merge_time_zones(
            matches,
            self.config.time_zone_extractor.extract(source, reference),
            source
        )


class BaseMinimalTimeParser(BaseTimeParser):
    """Time parser for the minimal culture; keeps a reference to its config."""

    def __init__(self, config):
        super().__init__(config)
        self.config = config
class MinimalCommonDateTimeParserConfiguration:
    """Shared parser configuration for the minimal date-time culture.

    Builds the number extractors/parser, the date and time extractors, and the
    date and time parsers, and exposes them through read-only properties.

    :param dmyDateFormat: forwarded to ``MinimalDateExtractorConfiguration``
        to choose the date pattern set (day-month-year ordering when true,
        judging by the parameter name - confirm).
    """

    @property
    def month_of_year(self) -> Dict[str, int]:
        return self._month_of_year

    @property
    def time_zone_parser(self) -> DateTimeParser:
        return self._time_zone_parser

    @property
    def check_both_before_after(self) -> Pattern:
        return self._check_both_before_after

    @property
    def cardinal_extractor(self) -> MinimalNumberExtractor:
        return self._cardinal_extractor

    @property
    def integer_extractor(self) -> MinimalNumberExtractor:
        return self._integer_extractor

    @property
    def number_parser(self) -> MinimalNumberParser:
        return self._number_parser

    @property
    def date_extractor(self) -> DateTimeExtractor:
        return self._date_extractor

    @property
    def time_extractor(self) -> DateTimeExtractor:
        return self._time_extractor

    @property
    def date_parser(self) -> DateTimeParser:
        return self._date_parser

    @property
    def time_parser(self) -> DateTimeParser:
        return self._time_parser

    @property
    def day_of_month(self) -> Dict[str, int]:
        return self._day_of_month

    @property
    def utility_configuration(self) -> DateTimeUtilityConfiguration:
        return self._utility_configuration

    def __init__(self, dmyDateFormat: bool = True):
        # Local import: BaseMinimalDateExtractor is defined next to
        # BaseMinimalDateParser but is not in this module's import list.
        from recognizers_date_time.date_time.minimal.base_minimal_date import BaseMinimalDateExtractor

        # Kept from the original; this class does not subclass
        # BaseMinimalDateParserConfiguration.
        BaseMinimalDateParserConfiguration.__init__(self)

        self._month_of_year = MinimalDateTime.MonthOfYear
        self._utility_configuration = MinimalDateTimeUtilityConfiguration()
        self._time_zone_parser = BaseTimeZoneParser()
        self._check_both_before_after = MinimalDateTime.CheckBothBeforeAfter
        self._cardinal_extractor = MinimalCardinalExtractor()
        self._integer_extractor = MinimalIntegerExtractor()
        self._number_parser = MinimalNumberParser(
            MinimalNumberParserConfiguration())
        # The extractor slot must hold an extractor: the original built a
        # BaseMinimalDateParser here, which offers parse() but not extract().
        self._date_extractor = BaseMinimalDateExtractor(
            MinimalDateExtractorConfiguration(dmyDateFormat))
        self._time_extractor = BaseMinimalTimeExtractor(MinimalTimeExtractorConfiguration())
        # The parser configurations receive this object, so every attribute
        # they might read is assigned before the parsers are constructed.
        self._date_parser = BaseMinimalDateParser(
            MinimalDateParserConfiguration(self))
        self._time_parser = BaseMinimalTimeParser(MinimalTimeParserConfiguration(self))
MinimalDateTime.DateExtractor7S + date_extractor_8 = MinimalDateTime.DateExtractor5 + date_extractor_11 = MinimalDateTime.DateExtractor9S + + self._date_regex_list = [ + RegExpUtility.get_safe_reg_exp(date_extractor_4), + RegExpUtility.get_safe_reg_exp(date_extractor_7), + RegExpUtility.get_safe_reg_exp(date_extractor_8), + RegExpUtility.get_safe_reg_exp(date_extractor_11), + ] + + self._integer_extractor = MinimalIntegerExtractor() + self._number_parser = MinimalNumberParser( + MinimalNumberParserConfiguration()) + self._range_connector_symbol_regex = RegExpUtility.get_safe_reg_exp( + BaseDateTime.RangeConnectorSymbolRegex + ) + self._check_both_before_after = MinimalDateTime.CheckBothBeforeAfter + self._month_of_year = MinimalDateTime.MonthOfYear diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/date_parser_config.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/date_parser_config.py new file mode 100644 index 0000000000..7ccc2a52a3 --- /dev/null +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/date_parser_config.py @@ -0,0 +1,64 @@ +from typing import Pattern, List, Dict + +from recognizers_number import BaseNumberExtractor, BaseNumberParser +from recognizers_date_time.resources.minimal_date_time import MinimalDateTime +from recognizers_date_time.date_time.extractors import DateTimeExtractor +from recognizers_date_time.date_time.utilities import DateTimeUtilityConfiguration + + +class MinimalDateParserConfiguration: + + @property + def date_token_prefix(self) -> str: + return "" + + @property + def check_both_before_after(self) -> bool: + return self._check_both_before_after + + @property + def integer_extractor(self) -> BaseNumberExtractor: + return self._integer_extractor + + @property + def cardinal_extractor(self) -> BaseNumberExtractor: + return self._cardinal_extractor + + @property + def date_extractor(self) -> DateTimeExtractor: + return 
self._date_extractor + + @property + def number_parser(self) -> BaseNumberParser: + return self._number_parser + + @property + def month_of_year(self) -> Dict[str, int]: + return self._month_of_year + + @property + def day_of_month(self) -> Dict[str, int]: + return self._day_of_month + + @property + def date_regex(self) -> List[Pattern]: + return self._date_regex + + @property + def utility_configuration(self) -> DateTimeUtilityConfiguration: + return self._utility_configuration + + def __init__(self, config): + self._integer_extractor = config.integer_extractor + self._cardinal_extractor = config.cardinal_extractor + self._date_extractor = config.date_extractor + self._number_parser = config.number_parser + self._month_of_year = config.month_of_year + self._day_of_month = config.day_of_month + self._date_regex = config.date_extractor.config.date_regex_list + self._utility_configuration = config.utility_configuration + self._check_both_before_after = MinimalDateTime.CheckBothBeforeAfter + + def __normalize(self, source: str) -> str: + return source.replace('á', 'a').replace('é', 'e').replace('í', 'i').replace('ó', 'o').\ + replace('ú', 'u').replace('à', 'a') diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/merged_extractor_config.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/merged_extractor_config.py new file mode 100644 index 0000000000..043e1a5da6 --- /dev/null +++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/merged_extractor_config.py @@ -0,0 +1,44 @@ +from typing import List, Pattern + +from recognizers_text import Extractor +from recognizers_number import MinimalIntegerExtractor +from recognizers_date_time.resources.minimal_date_time import MinimalDateTime +from recognizers_date_time.date_time.extractors import DateTimeExtractor +from recognizers_date_time.date_time.minimal.base_minimal_date import BaseMinimalDateExtractor +from 
recognizers_date_time.date_time.minimal.base_minimal_time import BaseMinimalTimeExtractor
+from recognizers_date_time.date_time.minimal.date_extractor_config import MinimalDateExtractorConfiguration
+from recognizers_date_time.date_time.minimal.time_extractor_config import MinimalTimeExtractorConfiguration
+from recognizers_date_time.resources.base_date_time import BaseDateTime
+
+
+class BaseMinimalMergedExtractorConfiguration:
+    # Configuration for the minimal merged extractor: one date extractor, one
+    # time extractor and one integer extractor, all built in __init__ with
+    # their default configurations.
+    @property
+    def check_both_before_after(self) -> bool:
+        return self._check_both_before_after
+
+    @property
+    def date_extractor(self) -> DateTimeExtractor:
+        return self._date_extractor
+
+    @property
+    def time_extractor(self) -> DateTimeExtractor:
+        return self._time_extractor
+
+    @property
+    def integer_extractor(self) -> Extractor:
+        return self._integer_extractor
+
+    @property
+    def equal_regex(self) -> Pattern:
+        # NOTE(review): __init__ stores BaseDateTime.EqualRegex as-is, while
+        # merged_parser_config.py passes it through
+        # RegExpUtility.get_safe_reg_exp - confirm callers accept this value.
+        return self._equal_regex
+
+    @property
+    def filter_word_regex_list(self) -> List[Pattern]:
+        # The minimal configuration defines no filter-word patterns.
+        return []
+
+    def __init__(self):
+        # MinimalDateExtractorConfiguration() is called without arguments, so
+        # its dmyDateFormat parameter keeps its default (True).
+        self._date_extractor = BaseMinimalDateExtractor(MinimalDateExtractorConfiguration())
+        self._time_extractor = BaseMinimalTimeExtractor(MinimalTimeExtractorConfiguration())
+        self._integer_extractor = MinimalIntegerExtractor()
+        self._equal_regex = BaseDateTime.EqualRegex
+        self._check_both_before_after = MinimalDateTime.CheckBothBeforeAfter
diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/merged_parser_config.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/merged_parser_config.py
new file mode 100644
index 0000000000..e54afd2da5
--- /dev/null
+++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/merged_parser_config.py
@@ -0,0 +1,31 @@
+from typing import Pattern
+
+from recognizers_text.utilities import RegExpUtility
+from recognizers_date_time.date_time.minimal.common_configs import MinimalCommonDateTimeParserConfiguration
+from recognizers_date_time.date_time.base_date import BaseDateParser
+from recognizers_date_time.date_time.base_time import BaseTimeParser
+from recognizers_date_time.resources.minimal_date_time import MinimalDateTime, BaseDateTime
+
+
+class BaseMinimalMergedParserConfiguration(MinimalCommonDateTimeParserConfiguration):
+    """Merged-parser configuration: the shared minimal configuration plus
+    the compiled ``equal_regex`` and ``year_regex`` patterns."""
+
+    @property
+    def equal_regex(self) -> Pattern:
+        return self._equal_regex
+
+    @property
+    def year_regex(self) -> Pattern:
+        return self._year_regex
+
+    @property
+    def date_parser(self) -> BaseDateParser:
+        # Same attribute as the parent property; only the annotation differs.
+        return self._date_parser
+
+    @property
+    def time_parser(self) -> BaseTimeParser:
+        # Same attribute as the parent property; only the annotation differs.
+        return self._time_parser
+
+    def __init__(self, dmyDateFormat: bool = True):
+        """Initialise the shared components, then compile the extra patterns.
+
+        :param dmyDateFormat: forwarded unchanged to
+            MinimalCommonDateTimeParserConfiguration.
+        """
+        MinimalCommonDateTimeParserConfiguration.__init__(self, dmyDateFormat)
+        self._equal_regex = RegExpUtility.get_safe_reg_exp(BaseDateTime.EqualRegex)
+        self._year_regex = RegExpUtility.get_safe_reg_exp(MinimalDateTime.YearRegex)
diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/time_extractor_config.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/time_extractor_config.py
new file mode 100644
index 0000000000..ec1e0da93a
--- /dev/null
+++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/time_extractor_config.py
@@ -0,0 +1,27 @@
+from typing import List, Pattern
+from recognizers_text.utilities import RegExpUtility
+from ...resources.minimal_date_time import MinimalDateTime
+
+
+class MinimalTimeExtractorConfiguration:
+    """Configuration for the minimal time extractor: a fixed list of
+    compiled time patterns."""
+
+    @property
+    def options(self):
+        # Always 0 in the minimal configuration.
+        return 0
+
+    @property
+    def time_regex_list(self) -> List[Pattern]:
+        return self._time_regex_list
+
+    def __init__(self):
+        super().__init__()
+        self._time_regex_list: List[Pattern] = MinimalTimeExtractorConfiguration.get_time_regex_list()
+
+    @staticmethod
+    def get_time_regex_list() -> List[Pattern]:
+        """Return the compiled time patterns, in matching order.
+
+        Also called by MinimalTimeParserConfiguration, so extractor and
+        parser share one pattern list definition.
+        """
+        return [
+            RegExpUtility.get_safe_reg_exp(MinimalDateTime.TimeRegex1),
+            RegExpUtility.get_safe_reg_exp(MinimalDateTime.TimeRegex2),
+            RegExpUtility.get_safe_reg_exp(MinimalDateTime.TimeRegex3),
+            RegExpUtility.get_safe_reg_exp(MinimalDateTime.TimeRegex12),
+            RegExpUtility.get_safe_reg_exp(MinimalDateTime.ConnectNumRegex)
+        ]
diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/time_parser_config.py b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/time_parser_config.py
new file mode 100644
index 0000000000..a7b4e333b9
--- /dev/null
+++ b/Python/libraries/recognizers-date-time/recognizers_date_time/date_time/minimal/time_parser_config.py
@@ -0,0 +1,38 @@
+from typing import List, Pattern, Dict
+
+from recognizers_text import RegExpUtility
+from ..base_configs import DateTimeUtilityConfiguration
+from .time_extractor_config import MinimalTimeExtractorConfiguration
+from ..parsers import DateTimeParser
+
+
+class MinimalTimeParserConfiguration:
+    """Configuration for the minimal time parser.
+
+    Reuses the extractor's pattern list and takes the utility configuration
+    and time-zone parser from the shared configuration passed to
+    ``__init__``.
+    """
+
+    @property
+    def time_token_prefix(self) -> str:
+        # Always the empty string in the minimal configuration.
+        return ""
+
+    @property
+    def at_regex(self) -> Pattern:
+        return self._at_regex
+
+    @property
+    def time_regexes(self) -> List[Pattern]:
+        return self._time_regexes
+
+    @property
+    def numbers(self) -> Dict[str, int]:
+        # The minimal configuration defines no number words.
+        return {}
+
+    @property
+    def utility_configuration(self) -> DateTimeUtilityConfiguration:
+        return self._utility_configuration
+
+    @property
+    def time_zone_parser(self) -> DateTimeParser:
+        return self._time_zone_parser
+
+    def __init__(self, config):
+        """Collect the parser components.
+
+        :param config: object exposing ``utility_configuration`` and
+            ``time_zone_parser``.
+        """
+        self._time_regexes: List[Pattern] = MinimalTimeExtractorConfiguration.get_time_regex_list()
+        # Built once here instead of on every property access; the literal
+        # has no placeholders, so it is a plain string rather than an f-string.
+        self._at_regex: Pattern = RegExpUtility.get_safe_reg_exp(r'^[.]')
+
+        self._utility_configuration = config.utility_configuration
+        self._time_zone_parser = config.time_zone_parser
diff --git a/Python/libraries/recognizers-date-time/recognizers_date_time/resources/minimal_date_time.py b/Python/libraries/recognizers-date-time/recognizers_date_time/resources/minimal_date_time.py
new file mode 100644
index 0000000000..45ae94c5e7
--- /dev/null
+++ b/Python/libraries/recognizers-date-time/recognizers_date_time/resources/minimal_date_time.py
@@ -0,0 +1,58 @@
+from .base_date_time import
BaseDateTime + + +# pylint: disable=line-too-long + + +class MinimalDateTime: + LangMarker = 'min' + CheckBothBeforeAfter = False + DayRegex = f'\\b(?01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|1|20|21|22|23|24|25|26|27|28|29|2|30|31|3|4|5|6|7|8|9)(?:\\.[º°])?(?=\\b|t)' + MonthNumRegex = f'(?1[0-2]|(0)?[1-9])\\b' + AmDescRegex = f'({BaseDateTime.BaseAmDescRegex})' + PmDescRegex = f'({BaseDateTime.BasePmDescRegex})' + AmPmDescRegex = f'({BaseDateTime.BaseAmPmDescRegex})' + DescRegex = f'(?({AmDescRegex}|{PmDescRegex}))' + TwoDigitYearRegex = f'\\b(?([0-9]\\d))(?!(\\s*((\\:\\d)|{AmDescRegex}|{PmDescRegex}|\\.\\d))|\\.?[º°ª])\\b' + YearRegex = f'({BaseDateTime.FourDigitYearRegex})' + MonthNumWithYearRegex = f'\\b(({YearRegex}(\\s*?)[/\\-\\.~](\\s*?){MonthNumRegex})|({MonthNumRegex}(\\s*?)[/\\-\\.~](\\s*?){YearRegex}))\\b' + DateYearRegex = f'(?{YearRegex}|(?2[0-4]|[0-1]?\\d)' + BasicTime = f'(?{BaseDateTime.HourRegex}:{BaseDateTime.MinuteRegex}(:{BaseDateTime.SecondRegex})?|{BaseDateTime.HourRegex})' + ConnectNumRegex = f'({BaseDateTime.HourRegex}(?[0-5][0-9])\\s*{DescRegex})' + TimeRegexWithDotConnector = f'({BaseDateTime.HourRegex}\\.{BaseDateTime.MinuteRegex})' + TimeRegex1 = f'({BaseDateTime.HourRegex})\\s*({DescRegex})' + TimeRegex2 = f'(t)?{BaseDateTime.HourRegex}(\\s*)?:(\\s*)?{BaseDateTime.MinuteRegex}((\\s*)?:(\\s*)?{BaseDateTime.SecondRegex})?(\\s*({DescRegex})|\\b)' + TimeRegex3 = f'\\b({TimeRegexWithDotConnector}(\\s*({DescRegex})))' + TimeRegex12 = f'{BaseDateTime.HourRegex}(\\s*){BaseDateTime.MinuteRegex}(\\s*{DescRegex})?' 
+ + MonthOfYear = dict([("1", 1), + ("2", 2), + ("3", 3), + ("4", 4), + ("5", 5), + ("6", 6), + ("7", 7), + ("8", 8), + ("9", 9), + ("10", 10), + ("11", 11), + ("12", 12), + ("01", 1), + ("02", 2), + ("03", 3), + ("04", 4), + ("05", 5), + ("06", 6), + ("07", 7), + ("08", 8), + ("09", 9) + ]) + + DefaultLanguageFallback = 'DMY' +# pylint: enable=line-too-long diff --git a/Python/libraries/recognizers-date-time/setup.py b/Python/libraries/recognizers-date-time/setup.py index f9851bd0fa..348b0c7f1e 100644 --- a/Python/libraries/recognizers-date-time/setup.py +++ b/Python/libraries/recognizers-date-time/setup.py @@ -10,7 +10,7 @@ def read(fname): NAME = 'recognizers-text-date-time-genesys' -VERSION = '1.1.9' +VERSION = '1.1.13a2' REQUIRES = ['recognizers-text-genesys', 'recognizers-text-number-genesys', 'recognizers-text-number-with-unit-genesys', 'regex', 'datedelta', 'python-dateutil'] diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/__init__.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/__init__.py new file mode 100644 index 0000000000..c27932f374 --- /dev/null +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/__init__.py @@ -0,0 +1,2 @@ +from .extractors import * +from .parsers import * \ No newline at end of file diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/extractors.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/extractors.py new file mode 100644 index 0000000000..475c7d351f --- /dev/null +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/extractors.py @@ -0,0 +1,85 @@ +from typing import Dict, List, Pattern + +from recognizers_text.culture import Culture +from recognizers_text.extractor import Extractor 
+from recognizers_text.utilities import RegExpUtility
+from recognizers_number.culture import CultureInfo
+from recognizers_number.number.models import NumberMode
+from recognizers_number.number.minimal.extractors import MinimalNumberExtractor
+from recognizers_number_with_unit.number_with_unit.constants import Constants
+from recognizers_number_with_unit.number_with_unit.extractors import NumberWithUnitExtractorConfiguration
+from recognizers_number_with_unit.resources.minimal_numeric_with_unit import MinimalNumericWithUnit
+from recognizers_number_with_unit.resources.base_units import BaseUnits
+
+
+class MinimalNumberWithUnitExtractorConfiguration(NumberWithUnitExtractorConfiguration):
+    # Extractor configuration for the minimal culture. Values come from the
+    # MinimalNumericWithUnit and BaseUnits resources and are stored in
+    # __init__; the properties only return them.
+    @property
+    def ambiguity_filters_dict(self) -> Dict[Pattern, Pattern]:
+        # No ambiguity filters: returns None despite the Dict annotation.
+        return None
+
+    @property
+    def unit_num_extractor(self) -> Extractor:
+        return self._unit_num_extractor
+
+    @property
+    def build_prefix(self) -> str:
+        return self._build_prefix
+
+    @property
+    def build_suffix(self) -> str:
+        return self._build_suffix
+
+    @property
+    def connector_token(self) -> str:
+        return MinimalNumericWithUnit.ConnectorToken
+
+    @property
+    def compound_unit_connector_regex(self) -> Pattern:
+        return self._compound_unit_connector_regex
+
+    @property
+    def non_unit_regex(self) -> Pattern:
+        # Compiled from BaseUnits.PmNonUnitRegex in __init__.
+        return self._pm_non_unit_regex
+
+    @property
+    def ambiguous_unit_number_multiplier_regex(self) -> Pattern:
+        # Not defined for the minimal culture: returns None.
+        return None
+
+    def expand_half_suffix(self, source, result, numbers):
+        # No-op in the minimal configuration.
+        pass
+
+    def __init__(self, culture_info: CultureInfo):
+        # A None culture falls back to Culture.Minimal before the base class
+        # is initialised.
+        if culture_info is None:
+            culture_info = CultureInfo(Culture.Minimal)
+        super().__init__(culture_info)
+        self._unit_num_extractor = MinimalNumberExtractor(NumberMode.Unit)
+        self._build_prefix = MinimalNumericWithUnit.BuildPrefix
+        self._build_suffix = MinimalNumericWithUnit.BuildSuffix
+        self._compound_unit_connector_regex = RegExpUtility.get_safe_reg_exp(
+            MinimalNumericWithUnit.CompoundUnitConnectorRegex)
+        self._pm_non_unit_regex = RegExpUtility.get_safe_reg_exp(
+            BaseUnits.PmNonUnitRegex)
+
+
+class MinimalCurrencyExtractorConfiguration(MinimalNumberWithUnitExtractorConfiguration):
+    # Currency extractor configuration: adds the currency suffix, prefix and
+    # ambiguous-unit lists from MinimalNumericWithUnit.
+    @property
+    def extract_type(self) -> str:
+        return Constants.SYS_UNIT_CURRENCY
+
+    @property
+    def suffix_list(self) -> Dict[str, str]:
+        return self._suffix_list
+
+    @property
+    def prefix_list(self) -> Dict[str, str]:
+        return self._prefix_list
+
+    @property
+    def ambiguous_unit_list(self) -> List[str]:
+        return self._ambiguous_unit_list
+
+    def __init__(self, culture_info: CultureInfo = None):
+        # culture_info may be None; the parent __init__ substitutes
+        # Culture.Minimal in that case.
+        super().__init__(culture_info)
+        self._suffix_list = MinimalNumericWithUnit.CurrencySuffixList
+        self._prefix_list = MinimalNumericWithUnit.CurrencyPrefixList
+        self._ambiguous_unit_list = MinimalNumericWithUnit.AmbiguousCurrencyUnitList
\ No newline at end of file
diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/parsers.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/parsers.py
new file mode 100644
index 0000000000..551c014749
--- /dev/null
+++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/minimal/parsers.py
@@ -0,0 +1,42 @@
+from recognizers_text.culture import Culture
+from recognizers_text.extractor import Extractor
+from recognizers_text.parser import Parser
+from recognizers_number.culture import CultureInfo
+from recognizers_number.number.minimal.extractors import MinimalNumberExtractor, NumberMode
+from recognizers_number.number.parser_factory import AgnosticNumberParserFactory, ParserType
+from recognizers_number.number.minimal.parsers import MinimalNumberParserConfiguration
+from recognizers_number_with_unit.number_with_unit.parsers import NumberWithUnitParserConfiguration
+from recognizers_number_with_unit.resources.minimal_numeric_with_unit import MinimalNumericWithUnit
+
+
+class MinimalNumberWithUnitParserConfiguration(NumberWithUnitParserConfiguration):
+    # Parser configuration for the minimal culture: owns the number extractor
+    # and number parser used for the numeric part of a number-with-unit.
+    @property
+    def internal_number_parser(self) -> Parser:
+        return self._internal_number_parser
+
+    @property
+    def internal_number_extractor(self) -> Extractor:
+        return self._internal_number_extractor
+
+    @property
+    def connector_token(self) -> str:
+        return MinimalNumericWithUnit.ConnectorToken
+
+    def __init__(self, culture_info: CultureInfo, decimal_point_separator=True):
+        # A None culture falls back to Culture.Minimal.
+        # decimal_point_separator is forwarded unchanged to
+        # MinimalNumberParserConfiguration.
+        if culture_info is None:
+            culture_info = CultureInfo(Culture.Minimal)
+        super().__init__(culture_info)
+        self._internal_number_extractor = MinimalNumberExtractor(
+            NumberMode.DEFAULT)
+        self._internal_number_parser = AgnosticNumberParserFactory.get_parser(
+            ParserType.NUMBER,
+            MinimalNumberParserConfiguration(culture_info, decimal_point_separator=decimal_point_separator))
+
+
+class MinimalCurrencyParserConfiguration(MinimalNumberWithUnitParserConfiguration):
+    # Currency parser configuration: registers the currency suffix and prefix
+    # lists in the unit map and sets the ISO-code and fractional-unit maps.
+    def __init__(self, culture_info: CultureInfo = None, decimal_point_separator=True):
+        super().__init__(culture_info, decimal_point_separator)
+        self.add_dict_to_unit_map(MinimalNumericWithUnit.CurrencySuffixList)
+        self.add_dict_to_unit_map(MinimalNumericWithUnit.CurrencyPrefixList)
+        self.currency_name_to_iso_code_map = MinimalNumericWithUnit.CurrencyNameToIsoCodeMap
+        self.currency_fraction_code_list = MinimalNumericWithUnit.FractionalUnitNameToCodeMap
diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/number_with_unit_recognizer.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/number_with_unit_recognizer.py
index 96d0606e52..f84c97493a 100644
--- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/number_with_unit_recognizer.py
+++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/number_with_unit/number_with_unit_recognizer.py
@@ -64,6 +64,8 @@ from .catalan.parsers import
CatalanCurrencyParserConfiguration from recognizers_number_with_unit.number_with_unit.arabic.extractors import ArabicCurrencyExtractorConfiguration from recognizers_number_with_unit.number_with_unit.arabic.parsers import ArabicCurrencyParserConfiguration +from recognizers_number_with_unit.number_with_unit.minimal.extractors import MinimalCurrencyExtractorConfiguration +from recognizers_number_with_unit.number_with_unit.minimal.parsers import MinimalCurrencyParserConfiguration class NumberWithUnitOptions(IntFlag): @@ -298,6 +300,21 @@ def initialize_configuration(self): ])) # endregion + # region Minimal + self.register_model('CurrencyModel', Culture.Minimal, lambda options: CurrencyModel([ + ExtractorParserModel( + BaseMergedUnitExtractor( + MinimalCurrencyExtractorConfiguration()), + BaseMergedUnitParser(MinimalCurrencyParserConfiguration())) + ])) + self.register_model('CurrencyModel', Culture.MinimalOther, lambda options: CurrencyModel([ + ExtractorParserModel( + BaseMergedUnitExtractor( + MinimalCurrencyExtractorConfiguration()), + BaseMergedUnitParser(MinimalCurrencyParserConfiguration(decimal_point_separator=False))) + ])) + # endregion + def get_age_model(self, culture: str = None, fallback_to_default_culture: bool = True) -> Model: return self.get_model('AgeModel', culture, fallback_to_default_culture) diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/__init__.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/__init__.py index 566ce494c4..4385448ab4 100644 --- a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/__init__.py +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/__init__.py @@ -10,3 +10,4 @@ from .japanese_numeric_with_unit import JapaneseNumericWithUnit from .catalan_numeric_with_unit import CatalanNumericWithUnit from .arabic_numeric_with_unit import ArabicNumericWithUnit +from 
.minimal_numeric_with_unit import MinimalNumericWithUnit diff --git a/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/minimal_numeric_with_unit.py b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/minimal_numeric_with_unit.py new file mode 100644 index 0000000000..7247a8649e --- /dev/null +++ b/Python/libraries/recognizers-number-with-unit/recognizers_number_with_unit/resources/minimal_numeric_with_unit.py @@ -0,0 +1,570 @@ +from .base_numbers import BaseNumbers +# pylint: disable=line-too-long + + +class MinimalNumericWithUnit: + CurrencySuffixList = dict([("Abkhazian apsar", "abkhazian apsar|apsars"), + ("Afghan afghani", "afghan afghani|؋|afn|afghanis|afghani"), + ("Pul", "pul"), + ("Euro", "euros|euro|€|eur"), + ("Cent", "cents|cent|-cents|-cent"), + ("Albanian lek", "albanian lek|leks|lek"), + ("Qindarkë", "qindarkë|qindarkës|qindarke|qindarkes"), + ("Angolan kwanza", "angolan kwanza|kz|aoa|kwanza|kwanzas|angolan kwanzas"), + ("Armenian dram", "armenian drams|armenian dram"), + ("Aruban florin", "aruban florin|ƒ|awg|aruban florins"), + ("Bangladeshi taka", "bangladeshi taka|৳|bdt|taka|takas|bangladeshi takas"), + ("Paisa", "poisha|paisa"), + ("Bhutanese ngultrum", "bhutanese ngultrum|nu.|btn"), + ("Chetrum", "chetrums|chetrum"), + ("Bolivian boliviano", "bolivian boliviano|bob|bs.|bolivia boliviano|bolivia bolivianos|bolivian bolivianos"), + ("Bosnia and Herzegovina convertible mark", "bosnia and herzegovina convertible mark|bam"), + ("Fening", "fenings|fenings"), + ("Botswana pula", "botswana pula|bwp|pula|pulas|botswana pulas"), + ("Thebe", "thebe"), + ("Brazilian real", "brazilian real|r$|brl|brazil real|brazil reals|brazilian reals"), + ("Bulgarian lev", "bulgarian lev|bgn|лв|bulgaria lev|bulgaria levs|bulgarian levs"), + ("Stotinka", "stotinki|stotinka"), + ("Cambodian riel", "cambodian riel|khr|៛|cambodia riel|cambodia riels|cambodian riels"), + ("Cape Verdean escudo", "cape 
verdean escudo|cve"), + ("Costa Rican colón", "costa rican colón|costa rican colóns|crc|₡|costa rica colón|costa rica colóns|costa rican colon|costa rican colons|costa rica colon|costa rica colons"), + ("Salvadoran colón", "svc|salvadoran colón|salvadoran colóns|salvador colón|salvador colóns|salvadoran colon|salvadoran colons|salvador colon|salvador colons"), + ("Céntimo", "céntimo"), + ("Croatian kuna", "croatian kuna|kn|hrk|croatia kuna|croatian kunas|croatian kuna kunas"), + ("Lipa", "lipa"), + ("Czech koruna", "czech koruna|czk|kč|czech korunas"), + ("Haléř", "haléř"), + ("Eritrean nakfa", "eritrean nakfa|nfk|ern|eritrean nakfas"), + ("Ethiopian birr", "ethiopian birr|etb"), + ("Gambian dalasi", "gmd"), + ("Butut", "bututs|butut"), + ("Georgian lari", "georgian lari|lari|gel|₾"), + ("Tetri", "tetri"), + ("Ghanaian cedi", "ghanaian cedi|ghs|₵|gh₵"), + ("Pesewa", "pesewas|pesewa"), + ("Guatemalan quetzal", "guatemalan quetzal|gtq|guatemala quetzal"), + ("Haitian gourde", "haitian gourde|htg"), + ("Honduran lempira", "honduran lempira|hnl"), + ("Hungarian forint", "hungarian forint|huf|ft|hungary forint|hungary forints|hungarian forints"), + ("Fillér", "fillér"), + ("Iranian rial", "iranian rial|irr|iran rial|iran rials|iranian rials"), + ("Yemeni rial", "yemeni rial|yer|yemeni rials"), + ("Israeli new shekel", "₪|ils|agora"), + ("Lithuanian litas", "ltl|lithuanian litas|lithuan litas|lithuanian lit|lithuan lit"), + ("Japanese yen", "japanese yen|jpy|yen|-yen|¥|yens|japanese yens|japan yen|japan yens"), + ("Kazakhstani tenge", "kazakhstani tenge|kazakh tenge|kazak tenge|kzt"), + ("Kenyan shilling", "kenyan shilling|kes"), + ("North Korean won", "north korean won|kpw|north korean wons"), + ("South Korean won", "south korean won|krw|south korean wons"), + ("Korean won", "korean won|₩|korean wons"), + ("Kyrgyzstani som", "kyrgyzstani som|kgs"), + ("Uzbekitan som", "uzbekitan som|uzs"), + ("Lao kip", "lao kip|lak|₭n|₭"), + ("Att", "att"), + ("Lesotho loti", "lesotho 
loti|lsl|loti"), + ("Sente", "sente|lisente"), + ("South African rand", "south african rand|zar|south africa rand|south africa rands|south african rands"), + ("Macanese pataca", "macanese pataca|mop$|mop"), + ("Avo", "avos|avo"), + ("Macedonian denar", "macedonian denar|mkd|ден"), + ("Deni", "deni"), + ("Malagasy ariary", "malagasy ariary|mga"), + ("Iraimbilanja", "iraimbilanja"), + ("Malawian kwacha", "malawian kwacha|mk|mwk"), + ("Tambala", "tambala"), + ("Malaysian ringgit", "malaysian ringgit|rm|myr|malaysia ringgit|malaysia ringgits|malaysian ringgits"), + ("Mauritanian ouguiya", "mauritanian ouguiya|um|mro|mauritania ouguiya|mauritania ouguiyas|mauritanian ouguiyas"), + ("Khoums", "khoums"), + ("Mongolian tögrög", "mongolian tögrög|mnt|₮|mongolia tögrög|mongolia tögrögs|mongolian tögrögs|mongolian togrog|mongolian togrogs|mongolia togrog|mongolia togrogs"), + ("Mozambican metical", "mozambican metical|mt|mzn|mozambica metical|mozambica meticals|mozambican meticals"), + ("Burmese kyat", "burmese kyat|ks|mmk"), + ("Pya", "pya"), + ("Nicaraguan córdoba", "nicaraguan córdoba|nio"), + ("Nigerian naira", "nigerian naira|naira|ngn|₦|nigeria naira|nigeria nairas|nigerian nairas"), + ("Kobo", "kobo"), + ("Turkish lira", "turkish lira|try|tl|turkey lira|turkey liras|turkish liras"), + ("Kuruş", "kuruş"), + ("Omani rial", "omani rial|omr|ر.ع."), + ("Panamanian balboa", "panamanian balboa|b/.|pab"), + ("Centesimo", "centesimo"), + ("Papua New Guinean kina", "papua new guinean kina|kina|pgk"), + ("Toea", "toea"), + ("Paraguayan guaraní", "paraguayan guaraní|₲|pyg"), + ("Peruvian sol", "peruvian sol|soles|sol|peruvian nuevo sol"), + ("Polish złoty", "złoty|polish złoty|zł|pln|zloty|polish zloty|poland zloty|poland złoty"), + ("Grosz", "groszy|grosz|grosze"), + ("Qatari riyal", "qatari riyal|qar|qatari riyals|qatar riyal|qatar riyals"), + ("Saudi riyal", "saudi riyal|sar|saudi riyals"), + ("Riyal", "riyal|riyals|rial|﷼"), + ("Dirham", "dirham|dirhem|dirhm"), + ("Halala", 
"halalas|halala"), + ("Samoan tālā", "samoan tālā|tālā|tala|ws$|samoa|wst|samoan tala"), + ("Sene", "sene"), + ("São Tomé and Príncipe dobra", "são tomé and príncipe dobra|dobras|dobra"), + ("Sierra Leonean leone", "sierra leonean leone|sll|leone|le"), + ("Peseta", "pesetas|peseta"), + ("Netherlands guilder", "florin|netherlands antillean guilder|ang|nederlandse gulden|guilders|guilder|gulden|-guilders|-guilder|dutch guilders|dutch guilder|fl"), + ("Swazi lilangeni", "swazi lilangeni|lilangeni|szl|emalangeni"), + ("Tajikistani somoni", "tajikistani somoni|tjs|somoni"), + ("Diram", "dirams|diram"), + ("Thai baht", "thai baht|฿|thb|baht"), + ("Satang", "satang|satangs"), + ("Tongan paʻanga", "tongan paʻanga|paʻanga|tongan pa'anga|pa'anga"), + ("Seniti", "seniti"), + ("Ukrainian hryvnia", "ukrainian hryvnia|hyrvnia|uah|₴|ukrain hryvnia|ukrain hryvnias|ukrainian hryvnias"), + ("Vanuatu vatu", "vanuatu vatu|vatu|vuv"), + ("Venezuelan bolívar", "venezuelan bolívar|venezuelan bolívars|bs.f.|vef|bolívar fuerte|venezuelan bolivar|venezuelan bolivars|venezuela bolivar|venezuela bolivarsvenezuelan bolivar|venezuelan bolivars"), + ("Vietnamese dong", "vietnamese dong|vnd|đồng|vietnam dong|vietnamese dongs|vietnam dongs"), + ("Zambian kwacha", "zambian kwacha|zk|zmw|zambia kwacha|kwachas|zambian kwachas"), + ("Moroccan dirham", "moroccan dirham|mad|د.م."), + ("United Arab Emirates dirham", "united arab emirates dirham|د.إ|aed"), + ("Azerbaijani manat", "azerbaijani manat|azn"), + ("Turkmenistan manat", "turkmenistan manat|turkmenistan new manat|tmt"), + ("Manat", "manats|manat"), + ("Qəpik", "qəpik"), + ("Somali shilling", "somali shillings|somali shilling|shilin soomaali|-shilin soomaali|scellino|shilin|sh.so.|sos"), + ("Somaliland shilling", "somaliland shillings|somaliland shilling|soomaaliland shilin"), + ("Tanzanian shilling", "tanzanian shilling|tanzanian shillings|tsh|tzs|tanzania shilling|tanzania shillings"), + ("Ugandan shilling", "ugandan shilling|ugandan 
shillings|ugx|uganda shilling|uganda shillings"), + ("Romanian leu", "romanian leu|lei|ron|romania leu"), + ("Moldovan leu", "moldovan leu|mdl|moldova leu"), + ("Leu", "leu"), + ("Ban", "bani|-ban|ban"), + ("Nepalese rupee", "nepalese rupees|nepalese rupee|npr"), + ("Pakistani rupee", "pakistani rupees|pakistani rupee|pkr"), + ("Indian rupee", "indian rupees|indian rupee|inr|₹|india rupees|india rupee"), + ("Seychellois rupee", "seychellois rupees|seychellois rupee|scr|sr|sre"), + ("Mauritian rupee", "mauritian rupees|mauritian rupee|mur"), + ("Maldivian rufiyaa", "maldivian rufiyaas|maldivian rufiyaa|mvr|.ރ|maldive rufiyaas|maldive rufiyaa"), + ("Sri Lankan rupee", "sri lankan rupees|sri lankan rupee|lkr|රු|ரூ"), + ("Indonesian rupiah", "indonesian rupiah|rupiah|perak|rp|idr"), + ("Rupee", "rupee|rupees|rs"), + ("Danish krone", "danish krone|dkk|denmark krone|denmark krones|danish krones"), + ("Norwegian krone", "norwegian krone|nok|norway krone|norway krones|norwegian krones"), + ("Faroese króna", "faroese króna|faroese krona"), + ("Icelandic króna", "icelandic króna|isk|icelandic krona|iceland króna|iceland krona"), + ("Swedish krona", "swedish krona|sek|swedan krona"), + ("Krone", "kronor|krona|króna|krone|krones|kr|-kr"), + ("Øre", "Øre|oyra|eyrir"), + ("West African CFA franc", "west african cfa franc|xof|west africa cfa franc|west africa franc|west african franc"), + ("Central African CFA franc", "central african cfa franc|xaf|central africa cfa franc|central african franc|central africa franc"), + ("Comorian franc", "comorian franc|kmf"), + ("Congolese franc", "congolese franc|cdf"), + ("Burundian franc", "burundian franc|bif"), + ("Djiboutian franc", "djiboutian franc|djf"), + ("CFP franc", "cfp franc|xpf"), + ("Guinean franc", "guinean franc|gnf"), + ("Swiss franc", "swiss francs|swiss franc|chf|sfr."), + ("Rwandan franc", "Rwandan franc|rwf|rf|r₣|frw"), + ("Belgian franc", "belgian franc|bi.|b.fr.|bef|belgium franc"), + ("Rappen", "rappen|-rappen"), + 
("Franc", "francs|franc|fr.|fs"), + ("Centime", "centimes|centime|santim"), + ("Russian ruble", "russian ruble|₽|rub|russia ruble|russia ₽|russian ₽|russian rubles|russia rubles"), + ("New Belarusian ruble", "new belarusian ruble|byn|new belarus ruble|new belarus rubles|new belarusian rubles"), + ("Old Belarusian ruble", "old belarusian ruble|byr|old belarus ruble|old belarus rubles|old belarusian rubles"), + ("Transnistrian ruble", "transnistrian ruble|prb|р."), + ("Belarusian ruble", "belarusian ruble|belarus ruble|belarus rubles|belarusian rubles"), + ("Kopek", "kopek|kopeks"), + ("Kapyeyka", "kapyeyka"), + ("Ruble", "rubles|ruble|br"), + ("Algerian dinar", "algerian dinar|د.ج|dzd|algerian dinars|algeria dinar|algeria dinars"), + ("Bahraini dinar", "bahraini dinars|bahraini dinar|bhd|.د.ب"), + ("Santeem", "santeem|santeems"), + ("Iraqi dinar", "iraqi dinars|iraqi dinar|iraq dinars|iraq dinar|iqd|ع.د"), + ("Jordanian dinar", "jordanian dinars|jordanian dinar|د.ا|jod|jordan dinar|jordan dinars"), + ("Kuwaiti dinar", "kuwaiti dinars|kuwaiti dinar|kwd|د.ك"), + ("Libyan dinar", "libyan dinars|libyan dinar|libya dinars|libya dinar|lyd"), + ("Serbian dinar", "serbian dinars|serbian dinar|din.|rsd|дин.|serbia dinars|serbia dinar"), + ("Tunisian dinar", "tunisian dinars|tunisian dinar|tnd|tunisia dinars|tunisia dinar"), + ("Yugoslav dinar", "yugoslav dinars|yugoslav dinar|yun"), + ("Dinar", "dinars|dinar|denar|-dinars|-dinar"), + ("Fils", "fils|fulūs|-fils|-fil"), + ("Para", "para|napa"), + ("Millime", "millimes|millime"), + ("Argentine peso", "argentine peso|ars|argetina peso|argetina pesos|argentine pesos"), + ("Chilean peso", "chilean pesos|chilean peso|clp|chile peso|chile peso"), + ("Colombian peso", "colombian pesos|colombian peso|cop|colombia peso|colombia pesos"), + ("Cuban convertible peso", "cuban convertible pesos|cuban convertible peso|cuc|cuba convertible pesos|cuba convertible peso"), + ("Cuban peso", "cuban pesos|cuban peso|cup|cuba pesos|cuba peso"), + 
("Dominican peso", "dominican pesos|dominican peso|dop|dominica pesos|dominica peso"), + ("Mexican peso", "mexican pesos|mexican peso|mxn|mexico pesos|mexico peso|mxn$|mxn $|mex$"), + ("Philippine peso", "piso|philippine pesos|philippine peso|₱|php"), + ("Uruguayan peso", "uruguayan pesos|uruguayan peso|uyu"), + ("Peso", "pesos|peso"), + ("Centavo", "centavos|centavo"), + ("Alderney pound", "alderney pounds|alderney pound|alderney £"), + ("British pound", "british pounds|british pound|british £|gbp|pound sterling|pound sterlings|sterling|pound scot|pound scots"), + ("Guernsey pound", "guernsey pounds|guernsey £|ggp"), + ("Ascension pound", "ascension pounds|ascension pound|ascension £"), + ("Saint Helena pound", "saint helena pounds|saint helena pound|saint helena £|shp"), + ("Egyptian pound", "egyptian pounds|egyptian pound|egyptian £|egp|ج.م|egypt pounds|egypt pound"), + ("Falkland Islands pound", "falkland islands pounds|falkland islands pound|falkland islands £|fkp|falkland island pounds|falkland island pound|falkland island £"), + ("Gibraltar pound", "gibraltar pounds|gibraltar pound|gibraltar £|gip"), + ("Manx pound", "manx pounds|manx pound|manx £|imp"), + ("Jersey pound", "jersey pounds|jersey pound|jersey £|jep"), + ("Lebanese pound", "lebanese pounds|lebanese pound|lebanese £|lebanan pounds|lebanan pound|lebanan £|lbp|ل.ل"), + ("South Georgia and the South Sandwich Islands pound", "south georgia and the south sandwich islands pounds|south georgia and the south sandwich islands pound|south georgia and the south sandwich islands £"), + ("South Sudanese pound", "south sudanese pounds|south sudanese pound|south sudanese £|ssp|south sudan pounds|south sudan pound|south sudan £"), + ("Sudanese pound", "sudanese pounds|sudanese pound|sudanese £|ج.س.|sdg|sudan pounds|sudan pound|sudan £"), + ("Syrian pound", "syrian pounds|syrian pound|syrian £|ل.س|syp|syria pounds|syria pound|syria £"), + ("Tristan da Cunha pound", "tristan da cunha pounds|tristan da cunha 
pound|tristan da cunha £"), + ("Pound", "pounds|pound|-pounds|-pound|£"), + ("Pence", "pence"), + ("Shilling", "shillings|shilling|shilingi|sh"), + ("Penny", "pennies|penny"), + ("United States dollar", "united states dollars|united states dollar|united states $|u.s. dollars|u.s. dollar|u s dollar|u s dollars|usd|american dollars|american dollar|us$|us dollar|us dollars|u.s dollar|u.s dollars|usd$"), + ("East Caribbean dollar", "east caribbean dollars|east caribbean dollar|east Caribbean $|xcd"), + ("Australian dollar", "australian dollars|australian dollar|australian $|australian$|aud|australia dollars|australia dollar|australia $|australia$"), + ("Bahamian dollar", "bahamian dollars|bahamian dollar|bahamian $|bahamian$|bsd|bahamia dollars|bahamia dollar|bahamia $|bahamia$"), + ("Barbadian dollar", "barbadian dollars|barbadian dollar|barbadian $|bbd"), + ("Belize dollar", "belize dollars|belize dollar|belize $|bzd"), + ("Bermudian dollar", "bermudian dollars|bermudian dollar|bermudian $|bmd|bermudia dollars|bermudia dollar|bermudia $"), + ("British Virgin Islands dollar", "british virgin islands dollars|british virgin islands dollar|british virgin islands $|bvi$|virgin islands dollars|virgin islands dolalr|virgin islands $|virgin island dollars|virgin island dollar|virgin island $"), + ("Brunei dollar", "brunei dollar|brunei $|bnd"), + ("Sen", "sen"), + ("Singapore dollar", "singapore dollars|singapore dollar|singapore $|s$|sgd"), + ("Canadian dollar", "canadian dollars|canadian dollar|canadian $|cad|can$|c$|canada dollars|canada dolllar|canada $"), + ("Cayman Islands dollar", "cayman islands dollars|cayman islands dollar|cayman islands $|kyd|ci$|cayman island dollar|cayman island doolars|cayman island $"), + ("New Zealand dollar", "new zealand dollars|new zealand dollar|new zealand $|nz$|nzd|kiwi"), + ("Cook Islands dollar", "cook islands dollars|cook islands dollar|cook islands $|cook island dollars|cook island dollar|cook island $"), + ("Fijian dollar", "fijian 
dollars|fijian dollar|fijian $|fjd|fiji dollars|fiji dollar|fiji $"), + ("Guyanese dollar", "guyanese dollars|guyanese dollar|gyd|gy$"), + ("Hong Kong dollar", "hong kong dollars|hong kong dollar|hong kong $|hk$|hkd|hk dollars|hk dollar|hk $|hongkong$"), + ("Jamaican dollar", "jamaican dollars|jamaican dollar|jamaican $|j$|jamaica dollars|jamaica dollar|jamaica $|jmd"), + ("Kiribati dollar", "kiribati dollars|kiribati dollar|kiribati $"), + ("Liberian dollar", "liberian dollars|liberian dollar|liberian $|liberia dollars|liberia dollar|liberia $|lrd"), + ("Micronesian dollar", "micronesian dollars|micronesian dollar|micronesian $"), + ("Namibian dollar", "namibian dollars|namibian dollar|namibian $|nad|n$|namibia dollars|namibia dollar|namibia $"), + ("Nauruan dollar", "nauruan dollars|nauruan dollar|nauruan $"), + ("Niue dollar", "niue dollars|niue dollar|niue $"), + ("Palauan dollar", "palauan dollars|palauan dollar|palauan $"), + ("Pitcairn Islands dollar", "pitcairn islands dollars|pitcairn islands dollar|pitcairn islands $|pitcairn island dollars|pitcairn island dollar|pitcairn island $"), + ("Solomon Islands dollar", "solomon islands dollars|solomon islands dollar|solomon islands $|si$|sbd|solomon island dollars|solomon island dollar|solomon island $"), + ("Surinamese dollar", "surinamese dollars|surinamese dollar|surinamese $|srd"), + ("New Taiwan dollar", "new taiwan dollars|new taiwan dollar|nt$|twd|ntd"), + ("Trinidad and Tobago dollar", "trinidad and tobago dollars|trinidad and tobago dollar|trinidad and tobago $|trinidad $|trinidad dollar|trinidad dollars|trinidadian dollar|trinidadian dollars|trinidadian $|ttd"), + ("Tuvaluan dollar", "tuvaluan dollars|tuvaluan dollar|tuvaluan $"), + ("Dollar", "dollars|dollar|$"), + ("Chinese yuan", "yuan|kuai|chinese yuan|renminbi|cny|rmb|¥|元"), + ("Fen", "fen"), + ("Jiao", "jiao|mao"), + ("Finnish markka", "suomen markka|finnish markka|finsk mark|fim|markkaa|markka"), + ("Penni", "penniä|penni"), + ("Bitcoin", 
"bitcoin|bitcoins|btc|xbt|₿"), + ("Millibitcoin", "millibitcoin|millibitcoins|milibitcoin|milibitcoins"), + ("Satoshi", "satoshi|satoshis")]) + CurrencyNameToIsoCodeMap = dict([("Afghan afghani", "AFN"), + ("Euro", "EUR"), + ("Albanian lek", "ALL"), + ("Angolan kwanza", "AOA"), + ("Armenian dram", "AMD"), + ("Aruban florin", "AWG"), + ("Bangladeshi taka", "BDT"), + ("Bhutanese ngultrum", "BTN"), + ("Bolivian boliviano", "BOB"), + ("Bosnia and Herzegovina convertible mark", "BAM"), + ("Botswana pula", "BWP"), + ("Brazilian real", "BRL"), + ("Bulgarian lev", "BGN"), + ("Cambodian riel", "KHR"), + ("Cape Verdean escudo", "CVE"), + ("Costa Rican colón", "CRC"), + ("Croatian kuna", "HRK"), + ("Czech koruna", "CZK"), + ("Eritrean nakfa", "ERN"), + ("Ethiopian birr", "ETB"), + ("Gambian dalasi", "GMD"), + ("Georgian lari", "GEL"), + ("Ghanaian cedi", "GHS"), + ("Guatemalan quetzal", "GTQ"), + ("Haitian gourde", "HTG"), + ("Honduran lempira", "HNL"), + ("Hungarian forint", "HUF"), + ("Iranian rial", "IRR"), + ("Yemeni rial", "YER"), + ("Israeli new shekel", "ILS"), + ("Japanese yen", "JPY"), + ("Kazakhstani tenge", "KZT"), + ("Kenyan shilling", "KES"), + ("North Korean won", "KPW"), + ("South Korean won", "KRW"), + ("Kyrgyzstani som", "KGS"), + ("Lao kip", "LAK"), + ("Lesotho loti", "LSL"), + ("South African rand", "ZAR"), + ("Macanese pataca", "MOP"), + ("Macedonian denar", "MKD"), + ("Malagasy ariary", "MGA"), + ("Malawian kwacha", "MWK"), + ("Malaysian ringgit", "MYR"), + ("Mauritanian ouguiya", "MRO"), + ("Mongolian tögrög", "MNT"), + ("Mozambican metical", "MZN"), + ("Burmese kyat", "MMK"), + ("Nicaraguan córdoba", "NIO"), + ("Nigerian naira", "NGN"), + ("Turkish lira", "TRY"), + ("Omani rial", "OMR"), + ("Panamanian balboa", "PAB"), + ("Papua New Guinean kina", "PGK"), + ("Paraguayan guaraní", "PYG"), + ("Peruvian sol", "PEN"), + ("Polish złoty", "PLN"), + ("Qatari riyal", "QAR"), + ("Saudi riyal", "SAR"), + ("Samoan tālā", "WST"), + ("São Tomé and Príncipe dobra", 
"STN"), + ("Sierra Leonean leone", "SLL"), + ("Swazi lilangeni", "SZL"), + ("Tajikistani somoni", "TJS"), + ("Thai baht", "THB"), + ("Ukrainian hryvnia", "UAH"), + ("Vanuatu vatu", "VUV"), + ("Venezuelan bolívar", "VEF"), + ("Zambian kwacha", "ZMW"), + ("Moroccan dirham", "MAD"), + ("United Arab Emirates dirham", "AED"), + ("Azerbaijani manat", "AZN"), + ("Turkmenistan manat", "TMT"), + ("Somali shilling", "SOS"), + ("Tanzanian shilling", "TZS"), + ("Ugandan shilling", "UGX"), + ("Romanian leu", "RON"), + ("Moldovan leu", "MDL"), + ("Nepalese rupee", "NPR"), + ("Pakistani rupee", "PKR"), + ("Indian rupee", "INR"), + ("Seychellois rupee", "SCR"), + ("Mauritian rupee", "MUR"), + ("Maldivian rufiyaa", "MVR"), + ("Sri Lankan rupee", "LKR"), + ("Indonesian rupiah", "IDR"), + ("Danish krone", "DKK"), + ("Norwegian krone", "NOK"), + ("Icelandic króna", "ISK"), + ("Swedish krona", "SEK"), + ("West African CFA franc", "XOF"), + ("Central African CFA franc", "XAF"), + ("Comorian franc", "KMF"), + ("Congolese franc", "CDF"), + ("Burundian franc", "BIF"), + ("Djiboutian franc", "DJF"), + ("CFP franc", "XPF"), + ("Guinean franc", "GNF"), + ("Swiss franc", "CHF"), + ("Rwandan franc", "RWF"), + ("Russian ruble", "RUB"), + ("Transnistrian ruble", "PRB"), + ("New Belarusian ruble", "BYN"), + ("Algerian dinar", "DZD"), + ("Bahraini dinar", "BHD"), + ("Iraqi dinar", "IQD"), + ("Jordanian dinar", "JOD"), + ("Kuwaiti dinar", "KWD"), + ("Libyan dinar", "LYD"), + ("Serbian dinar", "RSD"), + ("Tunisian dinar", "TND"), + ("Argentine peso", "ARS"), + ("Chilean peso", "CLP"), + ("Colombian peso", "COP"), + ("Cuban convertible peso", "CUC"), + ("Cuban peso", "CUP"), + ("Dominican peso", "DOP"), + ("Mexican peso", "MXN"), + ("Uruguayan peso", "UYU"), + ("British pound", "GBP"), + ("Saint Helena pound", "SHP"), + ("Egyptian pound", "EGP"), + ("Falkland Islands pound", "FKP"), + ("Gibraltar pound", "GIP"), + ("Manx pound", "IMP"), + ("Jersey pound", "JEP"), + ("Lebanese pound", "LBP"), + ("South 
Sudanese pound", "SSP"), + ("Sudanese pound", "SDG"), + ("Syrian pound", "SYP"), + ("United States dollar", "USD"), + ("Australian dollar", "AUD"), + ("Bahamian dollar", "BSD"), + ("Barbadian dollar", "BBD"), + ("Belize dollar", "BZD"), + ("Bermudian dollar", "BMD"), + ("Brunei dollar", "BND"), + ("Singapore dollar", "SGD"), + ("Canadian dollar", "CAD"), + ("Cayman Islands dollar", "KYD"), + ("New Zealand dollar", "NZD"), + ("Fijian dollar", "FJD"), + ("Guyanese dollar", "GYD"), + ("Hong Kong dollar", "HKD"), + ("Jamaican dollar", "JMD"), + ("Liberian dollar", "LRD"), + ("Namibian dollar", "NAD"), + ("Solomon Islands dollar", "SBD"), + ("Surinamese dollar", "SRD"), + ("New Taiwan dollar", "TWD"), + ("Trinidad and Tobago dollar", "TTD"), + ("Tuvaluan dollar", "TVD"), + ("Chinese yuan", "CNY"), + ("Rial", "__RI"), + ("Shiling", "__S"), + ("Som", "__SO"), + ("Dirham", "__DR"), + ("Dinar", "_DN"), + ("Dollar", "__D"), + ("Manat", "__MA"), + ("Rupee", "__R"), + ("Krone", "__K"), + ("Krona", "__K"), + ("Crown", "__K"), + ("Frank", "__F"), + ("Mark", "__M"), + ("Ruble", "__RB"), + ("Peso", "__PE"), + ("Pound", "__P"), + ("Tristan da Cunha pound", "_TP"), + ("South Georgia and the South Sandwich Islands pound", "_SP"), + ("Somaliland shilling", "_SS"), + ("Pitcairn Islands dollar", "_PND"), + ("Palauan dollar", "_PD"), + ("Niue dollar", "_NID"), + ("Nauruan dollar", "_ND"), + ("Micronesian dollar", "_MD"), + ("Kiribati dollar", "_KID"), + ("Guernsey pound", "_GGP"), + ("Faroese króna", "_FOK"), + ("Cook Islands dollar", "_CKD"), + ("British Virgin Islands dollar", "_BD"), + ("Ascension pound", "_AP"), + ("Alderney pound", "_ALP"), + ("Abkhazian apsar", "_AA"), + ("Bitcoin", "_XBT")]) + FractionalUnitNameToCodeMap = dict([("Jiao", "JIAO"), + ("Kopek", "KOPEK"), + ("Pul", "PUL"), + ("Cent", "CENT"), + ("Qindarkë", "QINDARKE"), + ("Penny", "PENNY"), + ("Santeem", "SANTEEM"), + ("Cêntimo", "CENTIMO"), + ("Centavo", "CENTAVO"), + ("Luma", "LUMA"), + ("Qəpik", "QƏPIK"), + 
("Fils", "FILS"), + ("Poisha", "POISHA"), + ("Kapyeyka", "KAPYEYKA"), + ("Centime", "CENTIME"), + ("Chetrum", "CHETRUM"), + ("Paisa", "PAISA"), + ("Fening", "FENING"), + ("Thebe", "THEBE"), + ("Sen", "SEN"), + ("Stotinka", "STOTINKA"), + ("Fen", "FEN"), + ("Céntimo", "CENTIMO"), + ("Lipa", "LIPA"), + ("Haléř", "HALER"), + ("Øre", "ØRE"), + ("Piastre", "PIASTRE"), + ("Santim", "SANTIM"), + ("Oyra", "OYRA"), + ("Butut", "BUTUT"), + ("Tetri", "TETRI"), + ("Pesewa", "PESEWA"), + ("Fillér", "FILLER"), + ("Eyrir", "EYRIR"), + ("Dinar", "DINAR"), + ("Agora", "AGORA"), + ("Tïın", "TIIN"), + ("Chon", "CHON"), + ("Jeon", "JEON"), + ("Tyiyn", "TYIYN"), + ("Att", "ATT"), + ("Sente", "SENTE"), + ("Dirham", "DIRHAM"), + ("Rappen", "RAPPEN"), + ("Avo", "AVO"), + ("Deni", "DENI"), + ("Iraimbilanja", "IRAIMBILANJA"), + ("Tambala", "TAMBALA"), + ("Laari", "LAARI"), + ("Khoums", "KHOUMS"), + ("Ban", "BAN"), + ("Möngö", "MONGO"), + ("Pya", "PYA"), + ("Kobo", "KOBO"), + ("Kuruş", "KURUS"), + ("Baisa", "BAISA"), + ("Centésimo", "CENTESIMO"), + ("Toea", "TOEA"), + ("Sentimo", "SENTIMO"), + ("Grosz", "GROSZ"), + ("Sene", "SENE"), + ("Halala", "HALALA"), + ("Para", "PARA"), + ("Öre", "ORE"), + ("Diram", "DIRAM"), + ("Satang", "SATANG"), + ("Seniti", "SENITI"), + ("Millime", "MILLIME"), + ("Tennesi", "TENNESI"), + ("Kopiyka", "KOPIYKA"), + ("Tiyin", "TIYIN"), + ("Hào", "HAO"), + ("Ngwee", "NGWEE"), + ("Millibitcoin", "MILLIBITCOIN"), + ("Satoshi", "SATOSHI")]) + CompoundUnitConnectorRegex = f'(?and)' + MultiplierRegex = f'\\s*\\b(thousand|million|billion|trillion)s?\\b' + CurrencyPrefixList = dict([("Dobra", "db|std"), + ("Dollar", "$"), + ("Brazilian Real", "R$"), + ("United States dollar", "united states $|us$|us $|u.s. 
$|u.s $|usd$|usd"), + ("East Caribbean dollar", "east caribbean $|xcd"), + ("Mexican peso", "mxn$|mxn $|mex$|mxn"), + ("Australian dollar", "australian $|australia $|aud|aud$"), + ("Bahamian dollar", "bahamian $|bahamia $|bsd"), + ("Barbadian dollar", "barbadian $|barbadin $|bbd"), + ("Belize dollar", "belize $|bzd"), + ("Bermudian dollar", "bermudian $|bmd"), + ("British Virgin Islands dollar", "british virgin islands $|bvi$|virgin islands $|virgin island $|british virgin island $"), + ("Brunei dollar", "brunei $|b$|bnd"), + ("Sen", "sen"), + ("Singapore dollar", "singapore $|s$|sgd"), + ("Canadian dollar", "canadian $|can$|c$|c $|canada $|cad|cad$"), + ("Cayman Islands dollar", "cayman islands $|ci$|cayman island $|kyd"), + ("New Zealand dollar", "new zealand $|nz$|nz $|nzd|nzd$"), + ("Cook Islands dollar", "cook islands $|cook island $"), + ("Fijian dollar", "fijian $|fiji $|fjd"), + ("Guyanese dollar", "gy$|gy $|g$|g $|gyd"), + ("Hong Kong dollar", "hong kong $|hk$|hkd|hk $|hkd"), + ("Indian rupee", "₹|inr"), + ("Jamaican dollar", "jamaican $|j$|jamaica $|jmd"), + ("Kiribati dollar", "kiribati $"), + ("Liberian dollar", "liberian $|liberia $|lrd"), + ("Micronesian dollar", "micronesian $"), + ("Namibian dollar", "namibian $|nad|n$|namibia $|nad"), + ("Nauruan dollar", "nauruan $"), + ("Niue dollar", "niue $"), + ("Palauan dollar", "palauan $"), + ("Pitcairn Islands dollar", "pitcairn islands $|pitcairn island $"), + ("Solomon Islands dollar", "solomon islands $|si$|si $|solomon island $|sbd"), + ("Surinamese dollar", "surinamese $|surinam $|srd"), + ("New Taiwan dollar", "nt$|nt $|ntd|twd"), + ("Trinidad and Tobago dollar", "trinidad and tobago $|trinidad $|trinidadian $|ttd"), + ("Tuvaluan dollar", "tuvaluan $"), + ("Samoan tālā", "ws$|wst"), + ("Chinese yuan", "¥|cny|rmb"), + ("Japanese yen", "¥|jpy"), + ("Euro", "€|eur"), + ("Pound", "£|gbp"), + ("Costa Rican colón", "₡|crc"), + ("Turkish lira", "₺|try"), + ("Bitcoin", "₿|btc|xbt")]) + 
AmbiguousCurrencyUnitList = [r'din.', r'kiwi', r'kina', r'kobo', r'lari', r'lipa', r'napa', r'para', r'sfr.', r'taka', r'tala', r'toea', r'vatu', r'yuan', r'all', r'ang', r'ban', r'bob', r'btn', r'byr', r'cad', r'cop', r'cup', r'dop', r'gip', r'jod', r'kgs', r'lak', r'lei', r'mga', r'mop', r'nad', r'omr', r'pul', r'sar', r'sbd', r'scr', r'sdg', r'sek', r'sen', r'sol', r'sos', r'std', r'try', r'yer', r'yen', r'db', r'pen', r'ron', r'mad', r'zar', r'gel', r'satoshi', r'satoshis'] + BuildPrefix = f'(?<=(\\s|^))' + BuildSuffix = f'(?=(\\s|\\W|$))' + ConnectorToken = "and" + +# pylint: enable=line-too-long diff --git a/Python/libraries/recognizers-number-with-unit/setup.py b/Python/libraries/recognizers-number-with-unit/setup.py index 7d29dfd700..2e379aeb39 100644 --- a/Python/libraries/recognizers-number-with-unit/setup.py +++ b/Python/libraries/recognizers-number-with-unit/setup.py @@ -10,7 +10,7 @@ def read(fname): NAME = "recognizers-text-number-with-unit-genesys" -VERSION = "1.1.9" +VERSION = "1.1.13a2" REQUIRES = ['recognizers-text-genesys', 'recognizers-text-number-genesys', 'regex'] setup( diff --git a/Python/libraries/recognizers-number/recognizers_number/culture.py b/Python/libraries/recognizers-number/recognizers_number/culture.py index db303bda39..06efca5866 100644 --- a/Python/libraries/recognizers-number/recognizers_number/culture.py +++ b/Python/libraries/recognizers-number/recognizers_number/culture.py @@ -1,6 +1,3 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
- from recognizers_text.culture import BaseCultureInfo, Culture from .number import LongFormatMode, LongFormatType @@ -16,6 +13,7 @@ Culture.Dutch: LongFormatMode.DOUBLE_DOT_COMMA, Culture.Japanese: LongFormatMode.DOUBLE_COMMA_DOT, Culture.Italian: LongFormatMode.DOUBLE_DOT_COMMA, + Culture.Minimal: LongFormatMode.DOUBLE_COMMA_DOT, } diff --git a/Python/libraries/recognizers-number/recognizers_number/number/__init__.py b/Python/libraries/recognizers-number/recognizers_number/number/__init__.py index 9315f4bb97..f407f58ab6 100644 --- a/Python/libraries/recognizers-number/recognizers_number/number/__init__.py +++ b/Python/libraries/recognizers-number/recognizers_number/number/__init__.py @@ -1,6 +1,3 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - from .models import * from .extractors import * from .parsers import * @@ -18,3 +15,4 @@ from .number_recognizer import * from .parser_factory import * from .utilities import * +from .minimal import * diff --git a/Python/libraries/recognizers-number/recognizers_number/number/extractors.py b/Python/libraries/recognizers-number/recognizers_number/number/extractors.py index caea762ec3..fcd2705c7f 100644 --- a/Python/libraries/recognizers-number/recognizers_number/number/extractors.py +++ b/Python/libraries/recognizers-number/recognizers_number/number/extractors.py @@ -1,5 +1,3 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
import copy from abc import abstractmethod from typing import List, Pattern, Dict, Match @@ -9,7 +7,6 @@ from recognizers_text.utilities import RegExpUtility from recognizers_text.extractor import Extractor, ExtractResult from recognizers_number.resources.base_numbers import BaseNumbers -from recognizers_number.resources.english_numeric import EnglishNumeric from recognizers_number.number.models import LongFormatType from recognizers_number.number.constants import Constants diff --git a/Python/libraries/recognizers-number/recognizers_number/number/minimal/__init__.py b/Python/libraries/recognizers-number/recognizers_number/number/minimal/__init__.py new file mode 100644 index 0000000000..d79a5447e6 --- /dev/null +++ b/Python/libraries/recognizers-number/recognizers_number/number/minimal/__init__.py @@ -0,0 +1,2 @@ +from .extractors import * +from .parsers import * diff --git a/Python/libraries/recognizers-number/recognizers_number/number/minimal/extractors.py b/Python/libraries/recognizers-number/recognizers_number/number/minimal/extractors.py new file mode 100644 index 0000000000..84c647284a --- /dev/null +++ b/Python/libraries/recognizers-number/recognizers_number/number/minimal/extractors.py @@ -0,0 +1,192 @@ +from typing import Pattern, List, NamedTuple +from collections import namedtuple +import regex + +from recognizers_text.extractor import ExtractResult +from recognizers_number.number.models import NumberMode, LongFormatMode +from recognizers_number.resources.minimal_numeric import MinimalNumeric +from recognizers_number.number.extractors import ReVal, ReRe, BaseNumberExtractor +from recognizers_number.number.constants import Constants + +ReVal = namedtuple('ReVal', ['re', 'val']) +ReRe = namedtuple('ReRe', ['reKey', 'reVal']) +MatchesVal = namedtuple('MatchesVal', ['matches', 'val']) + + +class MinimalNumberExtractor(BaseNumberExtractor): + @property + def regexes(self) -> List[ReVal]: + return self.__regexes + + @property + def 
ambiguity_filters_dict(self) -> List[ReRe]: + return self.__ambiguity_filters_dict + + @property + def _extract_type(self) -> str: + return Constants.SYS_NUM + + def __init__(self, mode: NumberMode = NumberMode.DEFAULT): + self.__regexes: List[ReVal] = list() + cardinal_ex: MinimalCardinalExtractor = None + + if mode is NumberMode.PURE_NUMBER: + cardinal_ex = MinimalCardinalExtractor( + MinimalNumeric.PlaceHolderPureNumber) + elif mode is NumberMode.CURRENCY: + self.__regexes.append( + ReVal(re=MinimalNumeric.CurrencyRegex, val='IntegerNum')) + + if cardinal_ex is None: + cardinal_ex = MinimalCardinalExtractor() + + self.__regexes.extend(cardinal_ex.regexes) + + ambiguity_filters_dict: List[ReRe] = list() + + self.__ambiguity_filters_dict = ambiguity_filters_dict + + def extract(self, source: str) -> List[ExtractResult]: + if source is None or len(source.strip()) == 0: + return list() + result: List[ExtractResult] = list() + match_source = dict() + matched: List[bool] = [False] * len(source) + + matches_list = list(map( + lambda x: MatchesVal(matches=list(regex.finditer(x.re, source)), + val=x.val), self.regexes)) + matches_list = list(filter(lambda x: len(x.matches) > 0, matches_list)) + for ml in matches_list: + for m in ml.matches: + for j in range(len(m.group())): + matched[m.start() + j] = True + # Keep Source Data for extra information + match_source[m] = ml.val + + last = -1 + for i in range(len(source)): + if not matched[i]: + last = i + else: + if i + 1 == len(source) or not matched[i + 1]: + start = last + 1 + length = i - last + substr = source[start:start + length].strip() + src_match = next((x for x in iter(match_source) if ( + x.start() == start and ( + x.end() - x.start()) == length)), None) + + # extract negative numbers + if self._negative_number_terms is not None: + match = regex.search(self._negative_number_terms, + source[0:start]) + if match is not None: + start = match.start() + length = length + match.end() - match.start() + substr = 
source[start:start + length].strip() + + if src_match is not None: + value = ExtractResult() + value.start = start + value.length = length + value.text = substr + value.type = self._extract_type + value.data = match_source.get(src_match, None) + result.append(value) + + result = self._filter_ambiguity(result, source) + return result + + +class MinimalCardinalExtractor(BaseNumberExtractor): + @property + def regexes(self) -> List[ReVal]: + return self.__regexes + + @property + def _extract_type(self) -> str: + return Constants.SYS_NUM_CARDINAL + + def __init__(self, placeholder: str = MinimalNumeric.PlaceHolderDefault): + self.__regexes: List[ReVal] = list() + + # Add integer regexes + integer_ex = MinimalIntegerExtractor(placeholder) + self.__regexes.extend(integer_ex.regexes) + + # Add double regexes + double_ex = MinimalDoubleExtractor(placeholder) + self.__regexes.extend(double_ex.regexes) + + +class MinimalIntegerExtractor(BaseNumberExtractor): + @property + def regexes(self) -> List[ + NamedTuple('re_val', [('re', Pattern), ('val', str)])]: + return self.__regexes + + @property + def _extract_type(self) -> str: + return Constants.SYS_NUM_INTEGER + + def __init__(self, placeholder: str = MinimalNumeric.PlaceHolderDefault): + self.__regexes = [ + ReVal( + re=MinimalNumeric.NumbersWithPlaceHolder(placeholder), + val='IntegerNum'), + ReVal( + re=MinimalNumeric.NumbersWithSuffix, + val='IntegerNum'), + ReVal( + re=self._generate_format_regex(LongFormatMode.INTEGER_DOT, + placeholder), + val='IntegerNum'), + ReVal( + re=self._generate_format_regex(LongFormatMode.INTEGER_BLANK, + placeholder), + val='IntegerNum'), + ReVal( + re=self._generate_format_regex( + LongFormatMode.INTEGER_NO_BREAK_SPACE, placeholder), + val='IntegerNum') + ] + + +class MinimalDoubleExtractor(BaseNumberExtractor): + @property + def regexes(self) -> List[ + NamedTuple('re_val', [('re', Pattern), ('val', str)])]: + return self.__regexes + + @property + def _extract_type(self) -> str: + return 
Constants.SYS_NUM_DOUBLE + + def __init__(self, placeholder: str = MinimalNumeric.PlaceHolderDefault): + self.__regexes = [ + ReVal( + re=MinimalNumeric.DoubleDecimalPointRegex(placeholder), + val='DoubleNum'), + ReVal( + re=MinimalNumeric.DoubleWithoutIntegralRegex(placeholder), + val='DoubleNum'), + ReVal( + re=MinimalNumeric.DoubleWithMultiplierRegex, + val='DoubleNum'), + ReVal( + re=MinimalNumeric.DoubleExponentialNotationRegex, + val='DoublePow'), + ReVal( + re=MinimalNumeric.DoubleCaretExponentialNotationRegex, + val='DoublePow'), + ReVal( + re=self._generate_format_regex(LongFormatMode.DOUBLE_DOT_COMMA, + placeholder), + val='DoubleNum'), + ReVal( + re=self._generate_format_regex( + LongFormatMode.DOUBLE_NO_BREAK_SPACE_COMMA, + placeholder), + val='DoubleNum') + ] diff --git a/Python/libraries/recognizers-number/recognizers_number/number/minimal/parsers.py b/Python/libraries/recognizers-number/recognizers_number/number/minimal/parsers.py new file mode 100644 index 0000000000..5ea437a37c --- /dev/null +++ b/Python/libraries/recognizers-number/recognizers_number/number/minimal/parsers.py @@ -0,0 +1,163 @@ +from typing import Pattern, Optional, List +import regex + +from recognizers_text.culture import Culture +from recognizers_text.utilities import RegExpUtility +from recognizers_text.extractor import ExtractResult +from recognizers_text.parser import ParseResult +from recognizers_text.meta_data import MetaData +from recognizers_number.culture import CultureInfo +from recognizers_number.number.parsers import BaseNumberParserConfiguration +from recognizers_number.resources.minimal_numeric import MinimalNumeric +from recognizers_number.number.parsers import BaseNumberParser, NumberParserConfiguration + + +class MinimalNumberParserConfiguration: + + @property + def negative_number_sign_regex(self) -> Pattern: + return self._negative_number_sign_regex + + @property + def is_multi_decimal_separator_culture(self) -> bool: + return False + + @property + def 
class MinimalNumberParser(BaseNumberParser):
    """Number parser that resolves digit ('Num') and power ('Pow') results.

    Extract results whose ``data`` is a list of inner extract results are
    parsed element by element and merged into groups.
    """

    def __init__(self, config: BaseNumberParserConfiguration):
        """Store the configuration and prepare the digit-detection regex.

        NOTE(review): ``BaseNumberParser.__init__`` is not invoked here, same
        as the original code; confirm that no base-class state is required.
        """
        self.config: NumberParserConfiguration = config
        self.supported_types: List[str] = list()

        self.arabic_number_regex: Pattern = RegExpUtility.get_safe_reg_exp(
            r'\d+', flags=regex.I | regex.S)
        self.round_number_set: List[str] = []
        self.is_non_standard_separator_variant = False

    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Parse one extract result into a parse result.

        Returns ``None`` when ``source.type`` is filtered out by
        ``supported_types`` or when the result is neither a list of inner
        results, a 'Num' result nor a 'Pow' result.
        """
        # Check if the parser is configured to support specific types
        if self.supported_types and source.type not in self.supported_types:
            return None

        extra = source.data if isinstance(source.data, str) else None
        if not extra:
            if self.arabic_number_regex.search(source.text):
                extra = 'Num'
            else:
                extra = self.config.lang_marker

        ret: Optional[ParseResult] = None
        if isinstance(source.data, List):
            ret = self._merge_inner_results(source)
        elif 'Num' in extra:
            ret = self._digit_number_parse(source)
        elif 'Pow' in extra:
            ret = self._power_number_parse(source)

        # Nothing matched: previously this fell through to ``ret.data`` and
        # raised AttributeError on None.
        if ret is None:
            return None

        if isinstance(ret.data, List):
            # NOTE(review): every iteration overwrites the outer result's
            # resolution string, so only the last group survives. Kept as in
            # the original; confirm whether each inner result should get its
            # own resolution string instead.
            for parsed_result in ret.data:
                ret.resolution_str = self._get_resolution_string(parsed_result.value)
        elif ret and ret.value is not None:
            # The original first assigned culture_info.format(ret.value) and
            # then immediately overwrote it with the value below; only the
            # effective assignment is kept.
            ret.resolution_str = self._get_resolution_string(ret.value)
            ret.text = ret.text.lower()

        return ret

    def _merge_inner_results(self, source: ExtractResult) -> ParseResult:
        """Parse every inner extract result in ``source.data`` and merge them.

        Consecutive inner values are summed while the mergeability check
        accepts the pair; each finished group becomes one entry of the
        returned result's ``data`` list.
        """
        ers = source.data
        inner_prs = [self.parse(rs) for rs in ers]
        merged_prs = []

        # A subclass cannot reach a parent's double-underscore method through
        # ``self.__is_mergeable``: inside this class the name is mangled to
        # ``_MinimalNumberParser__is_mergeable``, which does not exist.
        # Assumes BaseNumberParser defines the private ``__is_mergeable``
        # this code was written against -- TODO confirm.
        is_mergeable = self._BaseNumberParser__is_mergeable

        val = 0
        final_val = 0  # was unbound when no group had a non-zero total
        count = 0
        last_idx = len(inner_prs) - 1

        for idx, inner in enumerate(inner_prs):
            val += inner.value
            if idx < last_idx and is_mergeable(float(str(inner.value)),
                                               float(str(inner_prs[idx + 1].value))):
                count += 1
                continue

            start = ers[idx - count].start
            length = ers[idx].start + ers[idx].length - start
            # Positions are absolute; convert to an offset into source.text
            # and slice ``length`` characters from there (the original used
            # ``length`` itself as the end index).
            offset = start - source.start

            parsed_result = ParseResult()
            parsed_result.start = start
            parsed_result.length = length
            parsed_result.value = val
            parsed_result.text = source.text[offset:offset + length]
            parsed_result.type = source.type
            parsed_result.data = None
            merged_prs.append(parsed_result)

            if val != 0:
                final_val = val
            val = 0
            count = 0

        ret = ParseResult()
        ret.start = source.start
        ret.length = source.length
        ret.text = source.text
        ret.type = source.type
        ret.value = val + final_val
        ret.data = merged_prs
        return ret

    def _digit_number_parse(self, ext_result: ExtractResult) -> ParseResult:
        """Build a parse result for a digit-based extract result.

        Position, text and type are copied from ``ext_result``; the value is
        computed by ``_get_digital_value`` from the lower-cased text.
        """
        result = ParseResult()
        result.start = ext_result.start
        result.length = ext_result.length
        result.text = ext_result.text
        result.type = ext_result.type
        result.meta_data = MetaData() if not result.meta_data else result.meta_data

        # The whole lower-cased text is handed over with a fixed power of 1.
        power = 1
        handle = ext_result.text.lower()
        result.value = self._get_digital_value(handle, power)

        return result
def recognize_number(
        query: str,
        culture: str,
        options: NumberOptions = NumberOptions.NONE,
        fallback_to_default_culture: bool = True) -> List[ModelResult]:
    """Recognize numbers in ``query`` using the number model of ``culture``.

    A ``NumberRecognizer`` is created for ``culture`` and ``options``, its
    number model is looked up (``fallback_to_default_culture`` is forwarded
    to ``get_number_model``), and the model's parse results are returned.
    """
    number_model = NumberRecognizer(culture, options).get_number_model(
        culture, fallback_to_default_culture)
    return number_model.parse(query)
- from enum import Enum from recognizers_number.number.parsers import NumberParserConfiguration, BaseNumberParser, BasePercentageParser @@ -8,6 +5,7 @@ from recognizers_number.number.cjk_parsers import CJKNumberParser from recognizers_number.number.chinese.parsers import ChineseNumberParserConfiguration from recognizers_number.number.japanese.parsers import JapaneseNumberParserConfiguration +from recognizers_number.number.minimal.parsers import MinimalNumberParserConfiguration, MinimalNumberParser class ParserType(Enum): @@ -23,16 +21,20 @@ class ParserType(Enum): class AgnosticNumberParserFactory: @staticmethod def get_parser(parser_type: ParserType, language_config: NumberParserConfiguration) -> BaseNumberParser: - parser = BaseNumberParser(language_config) chinese = isinstance(language_config, ChineseNumberParserConfiguration) japanese = isinstance( language_config, JapaneseNumberParserConfiguration) + minimal = isinstance(language_config, MinimalNumberParserConfiguration) if chinese: parser = CJKNumberParser(language_config) elif japanese: parser = CJKNumberParser(language_config) + elif minimal: + parser = MinimalNumberParser(language_config) + else: + parser = BaseNumberParser(language_config) if parser_type is ParserType.CARDINAL: parser.supported_types = [ diff --git a/Python/libraries/recognizers-number/recognizers_number/resources/__init__.py b/Python/libraries/recognizers-number/recognizers_number/resources/__init__.py index 57de0ab4d8..017d158fed 100644 --- a/Python/libraries/recognizers-number/recognizers_number/resources/__init__.py +++ b/Python/libraries/recognizers-number/recognizers_number/resources/__init__.py @@ -11,4 +11,5 @@ from .portuguese_numeric import PortugueseNumeric from .spanish_numeric import SpanishNumeric from .japanese_numeric import JapaneseNumeric -from .catalan_numeric import CatalanNumeric \ No newline at end of file +from .catalan_numeric import CatalanNumeric +from .minimal_numeric import MinimalNumeric \ No newline at end of 
file diff --git a/Python/libraries/recognizers-number/recognizers_number/resources/minimal_numeric.py b/Python/libraries/recognizers-number/recognizers_number/resources/minimal_numeric.py new file mode 100644 index 0000000000..769465d053 --- /dev/null +++ b/Python/libraries/recognizers-number/recognizers_number/resources/minimal_numeric.py @@ -0,0 +1,42 @@ +# pylint: disable=line-too-long + + +class MinimalNumeric: + LangMarker = 'min' + NumberReplaceToken = '@builtin.num' + FractionNumberReplaceToken = '@builtin.num.fraction' + + def IntegerRegexDefinition(placeholder, thousandsmark): + return f'(((?