From 6caf2b8c68efbdced21a45b8aef8220ac2e56fb3 Mon Sep 17 00:00:00 2001 From: Nicolas Boisselier Date: Mon, 23 Sep 2024 10:47:47 +0200 Subject: [PATCH] --nohref --- bin/html2csv | 133 ++++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 60 deletions(-) diff --git a/bin/html2csv b/bin/html2csv index 8e190bcc..bf0e6a94 100755 --- a/bin/html2csv +++ b/bin/html2csv @@ -28,14 +28,14 @@ my $VERBOSE = $main::VERBOSE = 1; my $DEBUG = $main::DEBUG = 0; my %Opt = ( - 'num' => undef, - 'print-num' => 0, - 'tag' => 'table', - 'exp1'=> '', - 'exp2'=> '', - 'exp3'=> '', - 'sep' => "\t", - 'html' => 0, + 'num' => undef, + 'print-num' => 0, + 'tag' => 'table', + 'exp1'=> '', + 'exp2'=> '', + 'exp3'=> '', + 'sep' => "\t", + 'html' => 0, ); get_options(\%Opt); # NB 15.05.23 help() unless @ARGV; @@ -53,17 +53,17 @@ $_ = $Opt{sep}; $html =~ s/$_/ /g; $html =~ s/\s+/ /g; my %TAGS = ( - 'table' => [ - 'table', - 'tr', - 't[dh]', - ], - 'dl' => [ - 'dl', - '', - 'd[td]', - 2, - ], + 'table' => [ + 'table', + 'tr', + 't[dh]', + ], + 'dl' => [ + 'dl', + '', + 'd[td]', + 2, + ], ); exists $TAGS{$Opt{tag}} or die "$NAME: Unknown tag '$Opt{tag}'\n"; my ($T1,$T2,$T3,$COUNT) = @{ $TAGS{$Opt{tag}} }; @@ -76,47 +76,59 @@ my $exp3 = $Opt{exp3}; $exp3 and $exp3 = "[^>]*$exp3"; for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) { - $table_num++; - #warn $table_num; - next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num; - - $table = "<>$table" unless $T2; - for my $tr ($table =~ m,<(${T2})${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi) - { - my @col; - print "$table_num " if $Opt{'print-num'}; - - $tr = "<>$tr" unless $T3; - my $count = 0; - - #@_ = $tr =~ m,(]*>)\s*]*>(.*?),; - #die @_ if @_; - $tr =~ s,]*>\s*]*href="([^"]+)"[^>]*>(.*?),$1$2,g - if $Opt{href2td}; - - for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) - { - $td =~ s,]*href="([^"]+)"[^>]*>(.*?),[href=$1]$2,g - if $Opt{href2txt}; - $td = html2txt($td) unless $Opt{html}; - $_ = chr(194).chr(160); $td =~ s/$_/ /g; - $td =~ s/\s+/ /g; - $td = str_trim($td); - $count++; - - push(@col,$td); - if ($COUNT and $count > $COUNT) - { - print join($Opt{sep},@col)."\n"; - @col = (); - } - } - - print join($Opt{sep},@col)."\n" if @col; - #print "\n" unless $T3; - - } - #print "\n" unless $T2; + $table_num++; + #warn $table_num; + next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num; + + $table = "<>$table" unless $T2; + for my $tr ($table =~ m,<(${T2})${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi) + { + my @col; + print "$table_num " if $Opt{'print-num'}; + + $tr = "<>$tr" unless $T3; + my $count = 0; + + #@_ = $tr =~ m,(]*>)\s*]*>(.*?),; + #die @_ if @_; + $tr =~ + s,]*>\s*]*href="([^"]+)"[^>]*>(.*?),$1$2,g + if $Opt{href2td}; + + for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) + { + $td =~ s,]*href="([^"]+)"[^>]*>(.*?),[href=$1]$2,g + if $Opt{href2txt}; + + my $hrefTxt = ''; + if ($Opt{nohref}) + { + $td =~ s,]*href="([^"]+)"[^>]*>(.*?),,g; + $hrefTxt = $2; + #$td = $2 if $td =~ /^\s*$/; + } + + $td = html2txt($td) unless $Opt{html}; + $_ = chr(194).chr(160); $td =~ s/$_/ /g; + $td =~ s/\s+/ /g; + $td = str_trim($td); + + $td = $hrefTxt if $Opt{nohref} and $td eq '' and $hrefTxt; + $count++; + + push(@col,$td); + if ($COUNT and $count > $COUNT) + { + print join($Opt{sep},@col)."\n"; + @col = (); + } + } + + print join($Opt{sep},@col)."\n" if @col; + #print "\n" unless $T3; + + } + #print "\n" unless $T2; } @@ -233,6 +245,7 @@ Quick usage: -option[tag|T=s] table|dl Default: table -option[href2td!] Treat first column href as a new column -option[href2txt!] + -option[nohref!] Don't add href to column -option[sep|s=s] Default: tab (\t) -option[html!] Keep html entities (default: yes) -option[num|n=i] Only dump table number -- 2.47.3