From 19adb456ebedd969c61431d2459b1be44d55b04d Mon Sep 17 00:00:00 2001
From: Nicolas Boisselier
Date: Mon, 22 Jan 2018 01:04:45 +0000
Subject: [PATCH] bin/html-table2csv

---
Notes:
    Reviewer summary (this section sits after the "---" marker and is not
    part of the commit message).

    * Options: "table|t=i" is replaced by "num|n=i" (only dump the Nth
      matching container).  New options: "tag|T=s" (default "table"),
      "sep|s=s" (default tab) and the negatable flag "html".
    * %TAGS maps the value of the "tag" option to three regex fragments:
      container tag, row tag, cell tag.  The "dl" entry has an empty row
      fragment, so each <dl> body is wrapped in a synthetic "<>" ... "</>"
      pair and is matched as one single row of d[td] cells.
    * html2txt() is skipped for a cell when $Opt{html} is true.
    * CR/LF characters are deleted from the input before runs of
      whitespace are collapsed to one space.
    * Output columns are joined with $Opt{sep} instead of a literal tab.
    * The EXAMPLES and REQUIRES POD sections are removed.

    NOTE(review): $TAGS{$Opt{tag}} is dereferenced without checking that
    the key exists; confirm whether get_options() restricts "tag" to the
    keys of %TAGS.

 bin/html-table2csv | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/bin/html-table2csv b/bin/html-table2csv
index fc3689f9..29fd4900 100755
--- a/bin/html-table2csv
+++ b/bin/html-table2csv
@@ -28,7 +28,10 @@ my $VERBOSE = $main::VERBOSE = 1;
 my $DEBUG = $main::DEBUG = 0;
 
 my %Opt = (
-	'table' => undef,
+	'num' => undef,
+	'tag' => 'table',
+	'sep' => "\t",
+	'html' => 0,
 );
 get_options(\%Opt);
 help() unless @ARGV;
@@ -41,26 +44,42 @@ $main::_DATA_ = undef;
 #################################################################################
 @ARGV = map {m,^\w+://, ? "curl -s '$_' |" : $_} @ARGV if @ARGV;
 my $html = join('',<>);
+$html =~ s/[\r\n]+//g;
 $html =~ s/\s+/ /g;
 
+my %TAGS = (
+	'table' => [
+		'table',
+		'tr',
+		't[dh]',
+	],
+	'dl' => [
+		'dl',
+		'',
+		'd[td]',
+	],
+);
+my ($T1,$T2,$T3) = @{ $TAGS{$Opt{tag}} };
 my $table_num = 0;
-for my $table ($html =~ m,<table[^>]*>(.*?)<\s*/\s*table\s*>,gi) {
+for my $table ($html =~ m,<${T1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi) {
 	$table_num++;
-	next if defined $Opt{'table'} and $Opt{table} and $Opt{table} != $table_num;
+	#warn $table_num;
+	next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num;
 
-	for my $tr ($table =~ m,<tr[^>]*>(.*?)<\s*/\s*tr\s*>,gi) {
+	$table = "<>$table</>" unless $T2;
+	for my $tr ($table =~ m,<${T2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi) {
 		my @col;
 
-		for my $td ($tr =~ m,<t[dh][^>]*>(.*?)<\s*/\s*t[dh]\s*>,gi) {
-			$td = html2txt($td);
+		for my $td ($tr =~ m,<${T3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi) {
+			$td = html2txt($td) unless $Opt{html};
 			$_ = chr(194).chr(160);
 			$td =~ s/$_/ /g;
 			$td =~ s/\s+/ /g;
 			$td = str_trim($td);
 			push(@col,$td);
 		}
 
-		print join("\t",@col)."\n";
+		print join($Opt{sep},@col)."\n";
 	}
 
 
@@ -176,7 +195,10 @@ Quick usage:
 
 =head1 OPTIONS
 
- -option[table|t=i] Only dump table number
+ -option[tag|T=s] Default: table
+ -option[sep|s=s] Default: tab
+ -option[html!] Escape html (default: yes)
+ -option[num|n=i] Only dump table number
  -option[verbose|v+] Verbose mode: increase the verbosity level.
  -option[debug+] Debug mode: increase the verbosity level.
  -option[version|V] Print version (default: $VERSION)
@@ -185,14 +207,6 @@ Quick usage:
 
 =cut
 
-=head1 EXAMPLES
-
-...
-
-=head1 REQUIRES
-
-Getopt::Std, Pod::Usage
-
 =head1 COPYRIGHT AND LICENSE
 
 Copyright (C) 2017 Nicolas Boisselier
-- 
2.47.3