]> git.nbdom.net Git - nb.git/commitdiff
html2txt: style, script, if
authorNicolas Boisselier <nicolas.boisselier@gmail.com>
Wed, 5 Jul 2023 20:33:27 +0000 (22:33 +0200)
committerNicolas Boisselier <nicolas.boisselier@gmail.com>
Wed, 5 Jul 2023 20:33:27 +0000 (22:33 +0200)
lib/perl/NB/Functions.pm

index aa43fd9ba6f40d91acd2282f30bec02cd8802cc7..30bb739367f76aeb333f8999ba2f8a66b50b85e1 100644 (file)
@@ -546,30 +546,33 @@ sub html2txt {
        ,\n,gix;
 
   # Delete
-       $v =~ s,
-       # NB 04.06.23: NO we want entities !!!  
-               # NB 04.06.23 ( (<|&lt;)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>|&gt;) )
-               # NB 04.06.23 |( (<|&lt;)?\w+(\s+\w+=\S+)*/?(>|&gt;) )
+       $v =~ s,(^_DUMMY_$) # never matches; placeholder so the alternatives below can be reordered easily
+
+               |( <!--\[if [^\]]+\]>.*?<!\[endif\]--> ) # condition
+
+               |( <style[^>]*>.*?</style> ) # style
+               |( <noscript[^>]*>.*?</noscript> ) # noscript
+               |( <script[^>]*>.*?</script> ) # script
 
-               ( (<)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>) ) # start tag
+               |( (<)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>) ) # start tag
                |( (<)?\w+(\s+\w+=\S+)*/?(>) ) # end tag
 
-       # NB 04.06.23: NO we want entities !!!  
-               # NB 04.06.23 |( (<|&lt;)!\[CDATA\[ ) # cdata begin
-               # NB 04.06.23 | ( \]\](>|&gt;) ) # cdata end
                |( <!\[CDATA\[ ) # cdata begin
                |( \]\]> ) # cdata end
 
-       # NB 04.06.23: NO we want entities !!!  
-               # NB 04.06.23 |( (<|&lt;)/\w+\s*$ ) # incomplet html
-               # NB 04.06.23 |( ^(<|&lt;)\w+ .* ) # incomplet html
                |( </\w+\s*$ ) # incomplete html
                |( ^<\w+ .* ) # incomplete html
 
     |(<\!DOCTYPE[^>]+>)
     |([\w_-]+="[^"]+") # attrs
                |( <\w[^>]+> ) # tags
-       ,,gix;
+       ,,sgix;
+
+       # NB 03.07.23: Multiple empty nosy lines 
+       $v =~ s/\n(\s*\n)+/\n/sg;
+
+       # Multiple tabs
+       #$v =~ s/\t+/\t/g;
 
        $v =~ s/[\f ]+/ /g;
        &str_trim($v);