]> git.nbdom.net Git - nb.git/commitdiff
html2txt tag p for new line, tag regex containing :
authorNicolas Boisselier <nicolas.boisselier@gmail.com>
Sat, 29 Jul 2023 15:17:45 +0000 (17:17 +0200)
committerNicolas Boisselier <nicolas.boisselier@gmail.com>
Sat, 29 Jul 2023 15:17:45 +0000 (17:17 +0200)
lib/perl/NB/Functions.pm

index 6636155c52c90d6e5851df8a3ee4396e97dbf1a9..4a0058e319c837a762734bf8fe7fd230d1005282 100644 (file)
@@ -533,18 +533,6 @@ sub html2txt {
        my $v = shift @_;
 
   # New line
-       # NB 08.07.23 $v =~ s,
-               # NB 08.07.23 (
-                       # NB 08.07.23 (<|&lt;)(tr|li|)(\s+[^>])*>
-               # NB 08.07.23 )
-               # NB 08.07.23 |
-               # NB 08.07.23 (
-                       # NB 08.07.23 (<|&lt;)(br|hr)(\s*/\s*)?>
-               # NB 08.07.23 )
-               # NB 08.07.23 |
-               # NB 08.07.23 ( \\n )
-       # NB 08.07.23 ,\n,gix;
-
        $v =~ s,
                (
                        <(tr|li|)(\s+[^>])*>
@@ -556,15 +544,21 @@ sub html2txt {
                |
                ( \\n )
        ,\n,gix;
+       $v =~ s,
+       (<p[^>]*>.*?</p>)
+  ,$1\n,sgix;
 
   # Links
        $v =~ s,
        <a[^>]+href="([^"]+)"[^>]*>\g1</a>
-  ,$1,gix; # when target equal text
+  ,$1,gix; # when the href equals the link text
        $v =~ s,
        <a[^>]+href="([^"]+)"[^>]*>([^<]+)</a>
   ,$2 ( $1 ),gix;
 
+       # Tag and attribute names may contain ':' which plain \w does not match (eg: <o:shapedefaults v:ext="...)
+  my $tag = '[\w:]';
+
   # Delete
        $v =~ s,(^_DUMMY_$) # never matches - only here so the alternatives below can be reordered easily
 
@@ -575,8 +569,8 @@ sub html2txt {
                |( <noscript[^>]*>.*?</noscript> ) # noscript
                |( <script[^>]*>.*?</script> ) # script
 
-               |( (<)/?\w+(\s+\w+=['"]+[^'"]+['"]+)*/?(>) ) # start tag
-               |( (<)?\w+(\s+\w+=\S+)*/?(>) ) # end tag
+               |( (<)/?${tag}+(\s+${tag}+=['"]+[^'"]+['"]+)*/?(>) ) # start tag
+               |( (<)?${tag}+(\s+${tag}+=\S+)*/?(>) ) # end tag
 
                |( <!\[CDATA\[ ) # cdata begin
                |( \]\]> ) # cdata end