From db9672b72fd35fcdd4cf38a88c5dbd7ae2e08631 Mon Sep 17 00:00:00 2001 From: "E.Smith" <31170571+azlm8t@users.noreply.github.com> Date: Thu, 19 Oct 2017 17:32:43 +0100 Subject: [PATCH] eit: Scrape genre from text in OTA EIT. (#4509). Some broadcasters do not set the genre (dvb content descriptor). So we allow the user to specify a list of regex that will then match the genre from text. Obviously this is not as good as having the genre broadcast. This is configurable in the scraper settings. For example if we have "Detective" in the text then we can check the genre in "EN 300 468" to see this should be category 0x11. So we add a line in the scrape configuration file in data/conf/epggrab/eit/scrape/XX of the format: "genre_11": [ "(Detective)" ] We can add multiple regex such as: "genre_55": [ "(Muppets)", "(Sesame)" ] or combine them: "genre_55": [ "(Muppets|Sesame)" ] Issue: #4509 --- src/epggrab.h | 1 + src/epggrab/module.c | 17 +++++++++++ src/epggrab/module/eit.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) diff --git a/src/epggrab.h b/src/epggrab.h index 7944794a4..fd39f5998 100644 --- a/src/epggrab.h +++ b/src/epggrab.h @@ -273,6 +273,7 @@ struct epggrab_module_ota_scraper char *scrape_config; ///< Config to use or blank/NULL for default. int scrape_episode; ///< Scrape season/episode from EIT summary int scrape_subtitle;///< Scrape subtitle from EIT summary + int scrape_genre; ///< Scrape genre from EIT text fields }; /* diff --git a/src/epggrab/module.c b/src/epggrab/module.c index fc9fa1e3e..e310ff981 100644 --- a/src/epggrab/module.c +++ b/src/epggrab/module.c @@ -312,6 +312,23 @@ const idclass_t epggrab_mod_ota_scraper_class = { .off = offsetof(epggrab_module_ota_scraper_t, scrape_subtitle), .group = 2, }, + { + .type = PT_BOOL, + .id = "scrape_genre", + .name = N_("Scrape Genre"), + .desc = N_("Enable/disable scraping genre from the programme text fields. " + "Some broadcasters do not send genre information. 
" + "This allows scraping of genre " + "from within the broadcast text fields if supported by the " + "configuration file. " + "This is less accurate than information a broadcaster could provide " + "but is useful when the information is not provided. " + "Broadcasters that provide DVB genre information do not require " + "this option to be enabled." + ), + .off = offsetof(epggrab_module_ota_scraper_t, scrape_genre), + .group = 2, + }, {} } }; diff --git a/src/epggrab/module/eit.c b/src/epggrab/module/eit.c index 9add68ff9..a2a3f5084 100644 --- a/src/epggrab/module/eit.c +++ b/src/epggrab/module/eit.c @@ -46,6 +46,11 @@ typedef struct eit_private #define EIT_SPEC_NZ_FREEVIEW 2 #define EIT_SPEC_UK_CABLE_VIRGIN 3 +typedef struct eit_genre_regex +{ + uint8_t genre; ///< Genre code from epg.c + eit_pattern_list_t p_genre; ///< Regex across fields to match this genre. +} eit_genre_regex_t; /* Provider configuration */ typedef struct eit_module_t @@ -55,6 +60,8 @@ typedef struct eit_module_t eit_pattern_list_t p_enum; eit_pattern_list_t p_airdate; ///< Original air date parser eit_pattern_list_t p_scrape_subtitle;///< Scrape subtitle from summary data + int num_eit_genre_regex; + eit_genre_regex_t *eit_genres; } eit_module_t; /* ************************************************************************ @@ -483,6 +490,29 @@ _eit_scrape_episode(const char *str, return changed; } +/* Genre is handled differently to others in that we build + * up lists of genres in the event and then afterwards if the + * list exists we then see if the entire list has changed. 
+ */ +static void +_eit_scrape_genre(const char *str, + eit_module_t *eit_mod, + eit_event_t *ev) +{ + if (!str || !*str) return; + if (!eit_mod->num_eit_genre_regex) return; + + char buffer[2048]; + int i = 0; + for (; i < eit_mod->num_eit_genre_regex; ++i) { + eit_genre_regex_t *egr = &eit_mod->eit_genres[i]; + if (eit_pattern_apply_list(buffer, sizeof(buffer), str, &egr->p_genre)) { + if (!ev->genre) ev->genre = calloc(1, sizeof(epg_genre_list_t)); + epg_genre_list_add_by_eit(ev->genre, egr->genre); + } + } +} + /* ************************************************************************ * EIT Event @@ -678,6 +708,22 @@ static int _eit_process_event_one eit_mod, &en, &first_aired); } + if (eit_mod->scrape_genre) { + /* Genre scraping builds up a list in ev.genre so has no + * "scraped" value here to check. + */ + if (ev.title) + _eit_scrape_genre(lang_str_get(ev.title, ev.default_charset), + eit_mod, &ev); + if (ev.desc) + _eit_scrape_genre(lang_str_get(ev.desc, ev.default_charset), + eit_mod, &ev); + + if (ev.summary) + _eit_scrape_genre(lang_str_get(ev.summary, ev.default_charset), + eit_mod, &ev); + } + /* Update Episode */ if (ee) { *save |= epg_broadcast_set_episode(ebc, ee, &changes2); @@ -1062,6 +1108,30 @@ static void _eit_scrape_clear(eit_module_t *mod) eit_pattern_free_list(&mod->p_enum); eit_pattern_free_list(&mod->p_airdate); eit_pattern_free_list(&mod->p_scrape_subtitle); + mod->num_eit_genre_regex = 0; + free(mod->eit_genres); +} + +static void _eit_scrape_load_genre_regex(htsmsg_t *m, eit_module_t *mod) +{ + htsmsg_field_t *f; + HTSMSG_FOREACH(f, m) { + htsmsg_t *value; + if (!strncmp(f->hmf_name, "genre_", 6) && (value = htsmsg_get_list_by_field(f))) { + int genre_int; + if (sscanf(f->hmf_name, "genre_%x", &genre_int) == 1) { + /* One item parsed, this is the hex category code */ + if (genre_int < 0 || genre_int > 255) + continue; + ++mod->num_eit_genre_regex; + mod->eit_genres = realloc(mod->eit_genres, + mod->num_eit_genre_regex * 
sizeof(eit_genre_regex_t)); + eit_genre_regex_t *egr = &mod->eit_genres[mod->num_eit_genre_regex - 1]; + egr->genre = genre_int; + eit_pattern_compile_list(&egr->p_genre, value); + } + } + } } static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod ) @@ -1076,6 +1146,9 @@ static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod ) eit_pattern_compile_list(&mod->p_scrape_subtitle, htsmsg_get_list(m, "scrape_subtitle")); } + if (mod->scrape_genre) { + _eit_scrape_load_genre_regex(m, mod); + } return 1; } base-commit: cdd35c8cdf670738e1c94ac158583caf99b283e7 -- 2.14.1