From 8d3138caa1ee74f755d2d3c7c3de21ccfc92f991 Mon Sep 17 00:00:00 2001
From: Nicolas Boisselier
Date: Mon, 21 May 2018 18:57:06 +0100
Subject: [PATCH] bin/html2csv

Add a print_num (-pn) option that prefixes each output line with the
number of the table it was extracted from.

Allow a %TAGS entry to carry an optional fourth element ($COUNT).  When
it is non-zero a row is flushed every $COUNT cells; it is set to 2 for
the dl / d[td] entry.

Replace any occurrence of the output separator inside the HTML with a
space before parsing, so cell text cannot contain the separator.
---
Review notes (text here is not part of the commit message):

 * The separator is matched literally (\Q...\E) instead of being used
   as a regex through $_, so separators such as '|' or '.' no longer
   rewrite the whole document and $_ is left untouched.
 * The row regex groups the tag with (?:...) rather than (...).  In
   list context m//g returns every capture, so a capturing group made
   the loop iterate over the tag name as well as the row body.
 * The $COUNT flush triggers at ">= $COUNT" and resets the counter;
   previously the first line got $COUNT+1 cells and every later cell
   was printed on its own line.
 * The print_num prefix is emitted together with each printed line, so
   a row without cells no longer leaves a dangling prefix.
 * Line breaks and indentation of this patch were reconstructed from a
   whitespace-collapsed copy: context whitespace and blank lines may
   not match the target file byte-for-byte, and the blob ids on the
   index line are those of the original commit.

 bin/html2csv | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/bin/html2csv b/bin/html2csv
index 0e280c5d..f525c13e 100755
--- a/bin/html2csv
+++ b/bin/html2csv
@@ -29,6 +29,7 @@ my $DEBUG = $main::DEBUG = 0;
 
 my %Opt = (
 	'num' => undef,
+	'print_num' => 0,
 	'tag' => 'table',
 	'exp1'=> '',
 	'exp2'=> '',
@@ -48,6 +49,7 @@ $main::_DATA_ = undef;
 @ARGV = map {m,^\w+://, ? "curl -s '$_' |" : $_} @ARGV if @ARGV;
 my $html = join('',<>);
 $html =~ s/[\r\n]+//g;
+$html =~ s/\Q$Opt{sep}\E/ /g if length $Opt{sep};
 $html =~ s/\s+/ /g;
 
 my %TAGS = (
@@ -60,35 +62,48 @@ my %TAGS = (
 		'dl',
 		'',
 		'd[td]',
+		2,
 	],
 );
 exists $TAGS{$Opt{tag}} or die "$NAME: Unknown tag '$Opt{tag}'\n";
 
-my ($T1,$T2,$T3) = @{ $TAGS{$Opt{tag}} };
+my ($T1,$T2,$T3,$COUNT) = @{ $TAGS{$Opt{tag}} };
+$COUNT ||= 0;
 
 my $table_num = 0;
 my $exp1 = $Opt{exp1}; $exp1 and $exp1 = "[^>]*$exp1";
 my $exp2 = $Opt{exp2}; $exp2 and $exp2 = "[^>]*$exp2";
 my $exp3 = $Opt{exp3}; $exp3 and $exp3 = "[^>]*$exp3";
+
 for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) {
 	$table_num++;
 	#warn $table_num;
 	next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num;
 
 	$table = "<>$table" unless $T2;
 
-	for my $tr ($table =~ m,<${T2}${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi) {
+	for my $tr ($table =~ m,<(?:${T2})${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi) {
 		my @col;
+		my $prefix = $Opt{print_num} ? "$table_num " : '';
+		$tr = "<>$tr" unless $T3;
+		my $count = 0;
 		for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) {
 			$td = html2txt($td) unless $Opt{html};
 			$_ = chr(194).chr(160); $td =~ s/$_/ /g;
 			$td =~ s/\s+/ /g;
 			$td = str_trim($td);
+			$count++;
 			push(@col,$td);
+			if ($COUNT and $count >= $COUNT) {
+				print $prefix.join($Opt{sep},@col)."\n";
+				@col = (); $count = 0;
+			}
 		}
-		print join($Opt{sep},@col)."\n";
+		print $prefix.join($Opt{sep},@col)."\n" if @col;
+		#print "\n" unless $T3;
 	}
+	#print "\n" unless $T2;
 }
 
 
 
@@ -206,6 +221,7 @@ Quick usage:
 	-option[sep|s=s]        Default: tab (\t)
 	-option[html!]          Escape html (default: yes)
 	-option[num|n=i]        Only dump table number
+	-option[print_num|pn!]  Print num tag prefix
 	-option[exp1|E1=s]      Regexp filter on tag 1
 	-option[exp2|E2=s]      Regexp filter on tag 2
 	-option[exp3|E3=s]      Regexp filter on tag 3
-- 
2.47.3