my $DEBUG = $main::DEBUG = 0;
my %Opt = (
- 'num' => undef,
- 'print-num' => 0,
- 'tag' => 'table',
- 'exp1'=> '',
- 'exp2'=> '',
- 'exp3'=> '',
- 'sep' => "\t",
- 'html' => 0,
+ 'num' => undef, # when set and non-zero, only the table with this number is dumped
+ 'print-num' => 0, # when true, the table number and a space are printed ahead of each row
+ 'tag' => 'table', # key into %TAGS; an unknown key aborts the script
+ 'exp1'=> '', # NOTE(review): the loops interpolate $exp1..$exp3 rather than $Opt{exp1}; presumably get_options exports them - confirm
+ 'exp2'=> '',
+ 'exp3'=> '',
+ 'sep' => "\t", # string joined between the cells of an output line
+ 'html' => 0, # when false, each cell is passed through html2txt()
);
get_options(\%Opt);
# NB 15.05.23 help() unless @ARGV;
$html =~ s/\s+/ /g;
my %TAGS = (
- 'table' => [
- 'table',
- 'tr',
- 't[dh]',
- ],
- 'dl' => [
- 'dl',
- '',
- 'd[td]',
- 2,
- ],
+ 'table' => [ # slots are unpacked below as ($T1,$T2,$T3,$COUNT)
+ 'table', # $T1: container tag
+ 'tr', # $T2: row tag
+ 't[dh]', # $T3: cell tag pattern, interpolated into a regex
+ ],
+ 'dl' => [
+ 'dl',
+ '', # no row tag: the main loop wraps the whole list in <>...</> instead
+ 'd[td]',
+ 2, # $COUNT: once more cells than this are collected the main loop prints the line
+ ],
);
exists $TAGS{$Opt{tag}} or die "$NAME: Unknown tag '$Opt{tag}'\n";
my ($T1,$T2,$T3,$COUNT) = @{ $TAGS{$Opt{tag}} };
for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi)
{
- $table_num++;
- #warn $table_num;
- next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num;
-
- $table = "<>$table</>" unless $T2;
- for my $tr ($table =~ m,<(${T2})${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi)
- {
- my @col;
- print "$table_num " if $Opt{'print-num'};
-
- $tr = "<>$tr</>" unless $T3;
- my $count = 0;
-
- #@_ = $tr =~ m,(<td[^>]*>)\s*<a\s+href="([^"]+)"[^>]*>(.*?)</a>,;
- #die @_ if @_;
- $tr =~ s,<td[^>]*>\s*<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,<td>$1</td><td>$2,g
- if $Opt{href2td};
-
- for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi)
- {
- $td =~ s,<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,[href=$1]$2,g
- if $Opt{href2txt};
- $td = html2txt($td) unless $Opt{html};
- $_ = chr(194).chr(160); $td =~ s/$_/ /g;
- $td =~ s/\s+/ /g;
- $td = str_trim($td);
- $count++;
-
- push(@col,$td);
- if ($COUNT and $count > $COUNT)
- {
- print join($Opt{sep},@col)."\n";
- @col = ();
- }
- }
-
- print join($Opt{sep},@col)."\n" if @col;
- #print "\n" unless $T3;
-
- }
- #print "\n" unless $T2;
+    # Dump every matched container as text: one output line per row, cells
+    # joined with $Opt{sep}.  Containers are numbered in match order so that
+    # --num can select one and --print-num can label the rows.
+    $table_num++;
+    next if defined $Opt{'num'} and $Opt{num} and $Opt{num} != $table_num;
+
+    # Tags without a row element (empty $T2) are wrapped in <>...</> so the
+    # row pattern below still yields one pseudo-row holding all the cells.
+    $table = "<>$table</>" unless $T2;
+
+    # The row tag is grouped without capturing.  In list context m//g returns
+    # every capture of every match, so a capturing group made this loop visit
+    # the tag name as well and print the --print-num prefix twice per row.
+    for my $tr ($table =~ m,<(?:${T2})${exp2}[^>]*>(.*?)<\s*/\s*${T2}\s*>,gi)
+    {
+        my @col;
+        print "$table_num " if $Opt{'print-num'};
+
+        $tr = "<>$tr</>" unless $T3;
+        my $count = 0;
+
+        # --href2td: split a linked cell in two, the URL first, then the text.
+        $tr =~
+            s,<td[^>]*>\s*<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,<td>$1</td><td>$2,g
+            if $Opt{href2td};
+
+        for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi)
+        {
+            # --href2txt: inline each link as "[href=URL]text".
+            $td =~ s,<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,[href=$1]$2,g
+                if $Opt{href2txt};
+
+            # --nohref: remove the links, keeping the text of the last one as
+            # a fallback for cells that contain nothing else.  The text is
+            # recorded inside the substitution itself; reading $2 after
+            # s///g gives an undefined or stale value when nothing matched.
+            my $hrefTxt;
+            if ($Opt{nohref})
+            {
+                $td =~ s,<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,$hrefTxt = $2; '',ge;
+            }
+
+            # Clean the cell and the fallback text identically, so a fallback
+            # never leaks raw markup or stray whitespace into the output.
+            for my $txt ($td, $hrefTxt)
+            {
+                next unless defined $txt;
+                $txt = html2txt($txt) unless $Opt{html};
+                $txt =~ s/\xC2\xA0/ /g;    # UTF-8 encoded no-break space
+                $txt =~ s/\s+/ /g;
+                $txt = str_trim($txt);
+            }
+
+            # length() rather than truth, so a link text of "0" is kept.
+            $td = $hrefTxt
+                if $td eq '' and defined $hrefTxt and length $hrefTxt;
+            $count++;
+
+            push(@col,$td);
+
+            # NOTE(review): $count is never reset after a flush, so once it
+            # exceeds $COUNT every later cell is printed on a line of its
+            # own - confirm whether that is intended for the dl layout.
+            if ($COUNT and $count > $COUNT)
+            {
+                print join($Opt{sep},@col)."\n";
+                @col = ();
+            }
+        }
+
+        print join($Opt{sep},@col)."\n" if @col;
+    }
}
-option[tag|T=s] table|dl Default: table
-option[href2td!] Treat first column href as a new column
-option[href2txt!]
+ -option[nohref!] Remove links from cells (a cell left empty keeps the link text)
-option[sep|s=s] Default: tab (\t)
-option[html!] Keep html entities (default: yes)
-option[num|n=i] Only dump table number