From 8e2fa385a1a9a78ca5eea076ff79bd4259cee13b Mon Sep 17 00:00:00 2001 From: Nicolas Boisselier Date: Sun, 16 Apr 2023 23:17:07 +0200 Subject: [PATCH] html2txt(): use html_unescape on return text --- bin/html2csv | 12 ++++++++---- lib/perl/NB/Functions.pm | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/bin/html2csv b/bin/html2csv index 5709044e..a3367662 100755 --- a/bin/html2csv +++ b/bin/html2csv @@ -74,7 +74,8 @@ my $exp1 = $Opt{exp1}; $exp1 and $exp1 = "[^>]*$exp1"; my $exp2 = $Opt{exp2}; $exp2 and $exp2 = "[^>]*$exp2"; my $exp3 = $Opt{exp3}; $exp3 and $exp3 = "[^>]*$exp3"; -for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) { +for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) +{ $table_num++; #warn $table_num; next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num; @@ -86,14 +87,17 @@ for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) { $tr = "<>$tr" unless $T3; my $count = 0; - for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) { + for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) + { $td = html2txt($td) unless $Opt{html}; $_ = chr(194).chr(160); $td =~ s/$_/ /g; $td =~ s/\s+/ /g; $td = str_trim($td); $count++; + push(@col,$td); - if (1 and $COUNT and $count > $COUNT) { + if ($COUNT and $count > $COUNT) + { print join($Opt{sep},@col)."\n"; @col = (); } @@ -201,7 +205,7 @@ __DATA__ =head1 NAME -$NAME - Script to extract html table into csv +$NAME - Script to print html table into csv =head1 SYNOPSIS diff --git a/lib/perl/NB/Functions.pm b/lib/perl/NB/Functions.pm index cd3a0a69..9e94d0da 100644 --- a/lib/perl/NB/Functions.pm +++ b/lib/perl/NB/Functions.pm @@ -529,7 +529,7 @@ return $db; sub html2txt { -my $v = shift @_; + my $v = shift @_; # New line $v =~ s, @@ -561,6 +561,7 @@ my $v = shift @_; $v =~ s/[\f ]+/ /g; &str_trim($v); + $v = &NB::Functions::html_unescape($v); return $v; -- 2.47.3