From: Nicolas Boisselier Date: Wed, 5 Jul 2023 20:33:27 +0000 (+0200) Subject: html2txt: style, script, if X-Git-Url: https://git.nbdom.net/?a=commitdiff_plain;h=a3def3f3b40fc4e6136a15335cfa73a2189fc8f4;p=nb.git html2txt: style, script, if --- diff --git a/lib/perl/NB/Functions.pm b/lib/perl/NB/Functions.pm index aa43fd9b..30bb7393 100644 --- a/lib/perl/NB/Functions.pm +++ b/lib/perl/NB/Functions.pm @@ -546,30 +546,33 @@ sub html2txt { ,\n,gix; # Delete - $v =~ s, - # NB 04.06.23: NO we want entities !!! - # NB 04.06.23 ( (<|<)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>|>) ) - # NB 04.06.23 |( (<|<)?\w+(\s+\w+=\S+)*/?(>|>) ) + $v =~ s,(^_DUMMY_$) # never happend only for easy regexp change order + + |( ) # condition + + |( ]*>.*? ) # style + |( ]*>.*? ) # noscript + |( ]*>.*? ) # script - ( (<)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>) ) # start tag + |( (<)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>) ) # start tag |( (<)?\w+(\s+\w+=\S+)*/?(>) ) # end tag - # NB 04.06.23: NO we want entities !!! - # NB 04.06.23 |( (<|<)!\[CDATA\[ ) # cdata begin - # NB 04.06.23 | ( \]\](>|>) ) # cdata end |( ) # cdata end - # NB 04.06.23: NO we want entities !!! - # NB 04.06.23 |( (<|<)/\w+\s*$ ) # incomplet html - # NB 04.06.23 |( ^(<|<)\w+ .* ) # incomplet html |( ]+>) |([\w_-]+="[^"]+") # attrs |( <\w[^>]+> ) # tags - ,,gix; + ,,sgix; + + # NB 03.07.23: Multiple empty nosy lines + $v =~ s/\n(\s*\n)+/\n/sg; + + # Mutiple tabs + #$v =~ s/\t+/\t/g; $v =~ s/[\f ]+/ /g; &str_trim($v);