From 5528f7cc5673678a1e374205121212f698e97bd4 Mon Sep 17 00:00:00 2001 From: "E.Smith" <31170571+azlm8t@users.noreply.github.com> Date: Thu, 19 Oct 2017 17:32:43 +0100 Subject: [PATCH] eit: Scrape genre from text in OTA EIT. (#4509). Some broadcasters do not set the genre (dvb content descriptor) or send poor genre information such as only major categories. So we allow the user to specify a list of regex that will then match the genre from text. This is not as good as having accurate genre information broadcast, but can compensate for broadcasters. This is configurable in the scraper settings. For example if we have "Detective" in the text then we can check the genre in "EN 300 468" to see this should be category 0x11 with text "Detective / Thriller" (spacing is important). So we add a line in the scrape configuration file in data/conf/epggrab/eit/scrape/XX of the format: "Detective / Thriller": [ "(Detective)" ] The regular expression must contain a grouping operator (brackets), but the exact text returned is unimportant. As long as a match occurs then the programme is assigned the genre, which in the above example will be Detective / Thriller. We can add multiple regex such as: "Cartoons / Puppets": [ "(Muppets)", "(Sesame)" ] or combine them: "Cartoons / Puppets": [ "(Muppets|Sesame)" ] So a final example would be something like this: "genre" : [ { "Romance" : ["(Romcom)"], "Detective / Thriller": [ "(Detective)" ], "Cartoons / Puppets": [ "(Muppets|Sesame)" ], "Sports" : ["^(Snooker)"] }] Care should be taken to ensure the file is valid. The tools "jq < config_file" or "json_pp < config_file" can be used to parse the file outside of Tvheadend to detect issues and the logfile can be checked for lines of the format: "module eit - Scrape "Detective / Thriller" to genre 0x11" Issue: #4509. 
--- data/conf/epggrab/eit/scrape/Bulsatcom_39E | 29 ++++++++++ data/conf/epggrab/eit/scrape/uk | 25 ++++++++ src/epg.c | 4 +- src/epg.h | 2 +- src/epggrab.h | 1 + src/epggrab/module.c | 19 ++++++ src/epggrab/module/eit.c | 93 +++++++++++++++++++++++++++++- src/webui/static/app/tvheadend.js | 1 + 8 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 data/conf/epggrab/eit/scrape/Bulsatcom_39E diff --git a/data/conf/epggrab/eit/scrape/Bulsatcom_39E b/data/conf/epggrab/eit/scrape/Bulsatcom_39E new file mode 100644 index 000000000..d38dbf3e7 --- /dev/null +++ b/data/conf/epggrab/eit/scrape/Bulsatcom_39E @@ -0,0 +1,29 @@ +{ + "season_num": [ + "сезон ([0-9]+)", + "[, ] сезон ([0-9]+)", + "сез.? ([0-9]+)", + "[, ] с. ([0-9]+)", + "с. ([0-9]+), еп[.]", + "с. ([0-9]+)", + "еп. [0-9]+,.*, ([0-9]+), ?сез" + ], + "episode_num": [ + "([0-9]+) серия", + "еп. ([0-9]+)", + "[, ] ([0-9]+) еп[.]", + "([0-9]+) еп.[,]", + "епизод ([0-9]+)", + "Епизод ([0-9]+)", + "[, ] ([0-9]+) епизод", + "([0-9]+) епизод" + ], + "airdate": [ + ", ([0-9][0-9][0-9][0-9])" + ], + "genre" : [ { + "Romance": ["(драма, романтичен)"], + "Documentary": ["(документален)"] + } + ] +} diff --git a/data/conf/epggrab/eit/scrape/uk b/data/conf/epggrab/eit/scrape/uk index f7b383db0..bac2e8621 100644 --- a/data/conf/epggrab/eit/scrape/uk +++ b/data/conf/epggrab/eit/scrape/uk @@ -24,5 +24,30 @@ ], "is_new" : [ "^(New: )" + ], + "genre" : [ { + "Movie / Drama": ["(Movie|Film)"], + "Detective / Thriller" : ["(Murder mystery|thriller|sleuth|detective|Miss Marple|Poirot|Agatha Christie|^Columbo)"], + "Adventure / Western / War" : ["(Action adventure|wartime)", "^(Action|Western)"], + "Science fiction / Fantasy / Horror" : ["^(Sci-fi)", "^(Horror)", "(superhero fantasy)"], + "Comedy" : ["(Comedy-drama| comedy|action adventure|^Comedy)"], + "Romance" : ["(Romcom)"], + "Adult movie / Drama" : ["(18[+])", "(Adults only)", "(Very strong language|Extreme violence)"], + "News / Current affairs" : ["(BBC News|ITV 
News|Sky News)"], + "News / Weather report": ["(Followed by [Ww]eather|weather forecast|Shipping Forecast)"], + "Documentary" : ["(Documentary series)"], + "Talk show" : ["(chats about)"], + "Sports" : ["^(Snooker)"], + "Football / Soccer" : ["^(Football|Match of the)", "(NFL|Premier League)"], + "Team sports (excluding football)" : ["^(Rugby)"], + "Equestrian" : ["Racing.*(Ascot|Cheltenham)"], + "Children's / Youth programs" : ["(Family animation|Children's comedy)"], + "Cartoons / Puppets" : ["(Family animation)"], + "Music / Ballet / Dance" : ["(Dancing)"], + "Nature / Animals / Environment" : ["(Attenborough)"], + "Social / Political issues / Economics" : ["( politics)", "((Mayor's )?Question Time)", "House of (Lords|Commons)", "(Welsh Assembly|in Parliament)" ], + "Advertisement / Shopping" : ["(Auction|Teleshopping)"], + "Cooking" : ["(cooks up|whips up)"] + } ] } diff --git a/src/epg.c b/src/epg.c index e4ce0d3c2..0af2969c6 100644 --- a/src/epg.c +++ b/src/epg.c @@ -2713,7 +2713,7 @@ static int _genre_str_match ( const char *a, const char *b ) return (*a == '\0' && *b == '\0'); // end of string(both) } -static uint8_t _epg_genre_find_by_name ( const char *name, const char *lang ) +uint8_t epg_genre_find_by_name ( const char *name, const char *lang ) { uint8_t a, b; const char *s; @@ -2812,7 +2812,7 @@ int epg_genre_list_add_by_eit ( epg_genre_list_t *list, uint8_t eit ) int epg_genre_list_add_by_str ( epg_genre_list_t *list, const char *str, const char *lang ) { epg_genre_t g; - g.code = _epg_genre_find_by_name(str, lang); + g.code = epg_genre_find_by_name(str, lang); return epg_genre_list_add(list, &g); } diff --git a/src/epg.h b/src/epg.h index da2de66a4..a9281ee48 100644 --- a/src/epg.h +++ b/src/epg.h @@ -97,7 +97,7 @@ int epg_genre_list_add_by_str ( epg_genre_list_t *list, const char *str, const c /* Search */ int epg_genre_list_contains ( epg_genre_list_t *list, epg_genre_t *genre, int partial ); - +uint8_t epg_genre_find_by_name ( const char *name, const 
char *lang ); /* List all available genres */ htsmsg_t *epg_genres_list_all ( int major_only, int major_prefix, const char *lang ); diff --git a/src/epggrab.h b/src/epggrab.h index 7944794a4..fd39f5998 100644 --- a/src/epggrab.h +++ b/src/epggrab.h @@ -273,6 +273,7 @@ struct epggrab_module_ota_scraper char *scrape_config; ///< Config to use or blank/NULL for default. int scrape_episode; ///< Scrape season/episode from EIT summary int scrape_subtitle;///< Scrape subtitle from EIT summary + int scrape_genre; ///< Scrape genre from EIT text fields }; /* diff --git a/src/epggrab/module.c b/src/epggrab/module.c index d3a5a6df4..673353c45 100644 --- a/src/epggrab/module.c +++ b/src/epggrab/module.c @@ -312,6 +312,25 @@ const idclass_t epggrab_mod_ota_scraper_class = { .off = offsetof(epggrab_module_ota_scraper_t, scrape_subtitle), .group = 2, }, + { + .type = PT_BOOL, + .id = "scrape_genre", + .name = N_("Scrape Genre"), + .desc = N_("Enable/disable scraping genre from the programme text fields. " + "Some broadcasters do not send genre information or " + "send inadequate genre information. " + "This allows scraping of genre " + "from within the broadcast text fields if supported by the " + "configuration file. " + "This is less accurate than information a broadcaster could provide " + "but is useful when the information is not provided or is poor. " + "Broadcasters that provide DVB genre information do not require " + "this option to be enabled but may gain additional genres by " + "enabling it. For example, UK users benefit from enabling this." 
+ ), + .off = offsetof(epggrab_module_ota_scraper_t, scrape_genre), + .group = 2, + }, {} } }; diff --git a/src/epggrab/module/eit.c b/src/epggrab/module/eit.c index cdd3d4da6..76319b60f 100644 --- a/src/epggrab/module/eit.c +++ b/src/epggrab/module/eit.c @@ -46,6 +46,11 @@ typedef struct eit_private #define EIT_SPEC_NZ_FREEVIEW 2 #define EIT_SPEC_UK_CABLE_VIRGIN 3 +typedef struct eit_genre_regex +{ + uint8_t genre; ///< Genre code from epg.c + eit_pattern_list_t p_genre; ///< Regex across fields to match this genre. +} eit_genre_regex_t; /* Provider configuration */ typedef struct eit_module_t @@ -56,6 +61,8 @@ typedef struct eit_module_t eit_pattern_list_t p_airdate; ///< Original air date parser eit_pattern_list_t p_scrape_subtitle;///< Scrape subtitle from summary data eit_pattern_list_t p_is_new; ///< Is programme new to air + int num_eit_genre_regex; ///< Number of entries in eit_genres + eit_genre_regex_t *eit_genres; ///< Heap array of genre regex entries, grown as the config is loaded } eit_module_t; /* ************************************************************************ @@ -486,6 +493,31 @@ _eit_scrape_episode(const char *str, return changed; } +/* Genre is handled differently to others in that we build + * up lists of genres in the event and then afterwards if the + * list exists we then see if the entire list has changed. 
+ */ +static void +_eit_scrape_genre(const char *str, + eit_module_t *eit_mod, + eit_event_t *ev) +{ + char buffer[2048]; + int i = 0; + + if (!str || !*str) return; + if (!eit_mod->num_eit_genre_regex) return; + + for (; i < eit_mod->num_eit_genre_regex; ++i) { + eit_genre_regex_t *egr = &eit_mod->eit_genres[i]; + if (eit_pattern_apply_list(buffer, sizeof(buffer), str, &egr->p_genre)) { + /* Free'd by caller */ + if (!ev->genre) ev->genre = calloc(1, sizeof(epg_genre_list_t)); + epg_genre_list_add_by_eit(ev->genre, egr->genre); + } + } +} + /* ************************************************************************ * EIT Event @@ -683,6 +715,22 @@ static int _eit_process_event_one eit_mod, &en, &copyright_year, &is_new); } + if (eit_mod->scrape_genre) { + /* Genre scraping builds up a list in ev.genre so has no + * "scraped" value here to check. + */ + if (ev.title) + _eit_scrape_genre(lang_str_get(ev.title, ev.default_charset), + eit_mod, &ev); + if (ev.desc) + _eit_scrape_genre(lang_str_get(ev.desc, ev.default_charset), + eit_mod, &ev); + + if (ev.summary) + _eit_scrape_genre(lang_str_get(ev.summary, ev.default_charset), + eit_mod, &ev); + } + /* Update Episode */ if (ee) { *save |= epg_broadcast_set_episode(ebc, ee, &changes2); @@ -1080,6 +1128,49 @@ static void _eit_scrape_clear(eit_module_t *mod) eit_pattern_free_list(&mod->p_airdate); eit_pattern_free_list(&mod->p_scrape_subtitle); eit_pattern_free_list(&mod->p_is_new); + mod->num_eit_genre_regex = 0; + free(mod->eit_genres); + mod->eit_genres = NULL; /* the loader grows this with realloc(); never leave it dangling */ +} + +/// Convert a message containing an array of genre names to regex matches +/// into the internal format for EPG mapping. 
+/// For example: [ { "Documentary" : ["(Documentary series)"] }] becomes +/// epg_genre 0x23 --> regex +static void _eit_scrape_load_one_genre_regex(htsmsg_t *m, eit_module_t *mod) +{ + htsmsg_field_t *f; + if (!m) + return; + HTSMSG_FOREACH(f, m) { + htsmsg_t *value = htsmsg_get_list_by_field(f); + if (value && f->hmf_name && *f->hmf_name) { + const uint8_t genre_int = epg_genre_find_by_name(f->hmf_name, NULL); + if (genre_int) { + eit_genre_regex_t *grown = realloc(mod->eit_genres, + (mod->num_eit_genre_regex + 1) * sizeof(eit_genre_regex_t)); + if (!grown) + return; /* out of memory: keep the existing table and its count intact */ + mod->eit_genres = grown; + eit_genre_regex_t *egr = &grown[mod->num_eit_genre_regex++]; + egr->genre = genre_int; + eit_pattern_compile_list(&egr->p_genre, value); + tvhinfo(LS_TBL_EIT, "module %s - Scrape \"%s\" to genre 0x%x", mod->id, f->hmf_name, genre_int); + } + } + } +} + + +static void _eit_scrape_load_genre_regex(htsmsg_t *m, eit_module_t *mod) +{ + htsmsg_field_t *f; + if (!m) + return; + HTSMSG_FOREACH(f, m) { + htsmsg_t *value = htsmsg_get_map_by_field(f); + _eit_scrape_load_one_genre_regex(value, mod); + } } static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod ) @@ -1095,12 +1186,15 @@ static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod ) eit_pattern_compile_list(&mod->p_scrape_subtitle, htsmsg_get_list(m, "scrape_subtitle")); } + if (mod->scrape_genre) { + _eit_scrape_load_genre_regex(htsmsg_get_list(m, "genre"), mod); + } return 1; } static void _eit_module_load_config(eit_module_t *mod) { - if (!mod->scrape_episode && !mod->scrape_subtitle) { + if (!mod->scrape_episode && !mod->scrape_subtitle && !mod->scrape_genre) { tvhinfo(LS_TBL_EIT, "module %s - scraper disabled by config", mod->id); return; } diff --git a/src/webui/static/app/tvheadend.js b/src/webui/static/app/tvheadend.js index 506087c3d..d706a7333 100644 --- a/src/webui/static/app/tvheadend.js +++ b/src/webui/static/app/tvheadend.js @@ -166,6 +166,7 @@ var genre_minor = { "15" : "couch_and_lamp", "16" : "red_heart", "18" : 
"no_one_under_eighteen_symbol", + "21" : "sun_behind_cloud", "24" : "speaking_head_in_silhouette", "33" : "speaking_head_in_silhouette", "43" : "soccer_ball", -- 2.14.1