git.nbdom.net Git - nb.git/commitdiff
lib/postgres/w3schools.sh
author Nicolas Boisselier <nicolas.boisselier@gmail.com>
Sun, 4 Jun 2023 23:44:08 +0000 (01:44 +0200)
committer Nicolas Boisselier <nicolas.boisselier@gmail.com>
Sun, 4 Jun 2023 23:44:08 +0000 (01:44 +0200)
bin/html2csv
lib/postgres/css_entity.sh
lib/postgres/css_function.sh
lib/postgres/css_reference.sh
lib/postgres/css_selector.sh
lib/postgres/html_attribute.sh
lib/postgres/html_tag.sh
lib/postgres/w3schools.sh [new file with mode: 0755]

index c54d16eb807584f08a83ffc182f8b8063264e57d..a49e3d42141638ee2dd980445dc875c77695a6d6 100755 (executable)
@@ -91,11 +91,13 @@ for my $table ($html =~ m,<${T1}${exp1}[^>]*>(.*?)<\s*/\s*${T1}\s*>,gi)
 
     #@_ = $tr =~ m,(<td[^>]*>)\s*<a\s+href="([^"]+)"[^>]*>(.*?)</a>,;
     #die @_ if @_;
-    $tr =~ s,<td[^>]*>\s*<a\s+href="([^"]+)"[^>]*>(.*?)</a>,<td>$1</td><td>$2,g
-       if $Opt{href};
+    $tr =~ s,<td[^>]*>\s*<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,<td>$1</td><td>$2,g
+       if $Opt{href2td};
 
     for my $td ($tr =~ m,<${T3}${exp3}[^>]*>(.*?)<\s*/\s*${T3}\s*>,gi)
     {
+    $td =~ s,<a\s+[^>]*href="([^"]+)"[^>]*>(.*?)</a>,[href=$1]$2,g
+       if $Opt{href2txt};
       $td = html2txt($td) unless $Opt{html};
       $_ = chr(194).chr(160); $td =~ s/$_/ /g;
       $td =~ s/\s+/ /g;
@@ -229,7 +231,8 @@ Quick usage:
 =head1 OPTIONS
 
  -option[tag|T=s] table|dl Default: table
- -option[href!] Treat href as a column
+ -option[href2td!] Treat first column href as a new column
+ -option[href2txt!] Replace each link in a cell with [href=URL] followed by the link text
  -option[sep|s=s] Default: tab (\t)
  -option[html!] Escape html (default: yes)
  -option[num|n=i] Only dump table number
index e29af082b7dc351709a3e462bb44dd45b42a038c..b152fb6602128a2592dd418c13cb007c9f30f684 100755 (executable)
@@ -1,9 +1,3 @@
-#!/bin/bash -l
-#(chr,entity,name)
-http_get https://www.w3schools.com/cssref/css_entities.php | \
-xmllint --html --xpath '/html/body//div[@id="main"]//table' - 2>/dev/null | \
-xml2csv - | \
-tail -n+2 | \
-sed -E "s,(\\\\),\1\1," | \
-awk 'BEGIN{FS="\t"}{print $2"\t"$1"\t"$3}' \
-
+#!/bin/bash
+"$(dirname "$0")"/w3schools.sh /cssref/css_entities.php \
+       | awk -F$'\t' '{print $2"\t"$1"\t"$3}'
index 32387b4dc85fca5d1f7ded5947a5785ceed0aac3..64ee550efc5261a720360236e0e03f0fea883ccd 100755 (executable)
@@ -1,8 +1,2 @@
-#!/bin/bash -l
-http_get https://www.w3schools.com/cssref/css_functions.php | \
-xmllint --html --xpath "/html/body//div[@id=\"main\"]//table" - 2>/dev/null | \
-sed -E "s, href=\"([^\"]+)\",>https://www.w3schools.com/cssref/\1|</a><a,g" | \
-xml2csv - | \
-grep "|" | \
-sed "s/|/\t/" \
-| awk 'BEGIN{FS="\t"}{print $2"\t"$3"\t"$1}' \
+#!/bin/bash
+"$(dirname "$0")"/w3schools.sh /cssref/css_functions.php
index ad73d4cf23899e7298dc33078516680e989cb324..7f99602e92f1d17cf58f5b4e13927b46879d833b 100755 (executable)
@@ -1,8 +1,2 @@
-#!/bin/bash -l
-http_get https://www.w3schools.com/cssref/index.php | \
-xmllint --html --xpath '/html/body//div[@id="cssproperties"]//table' - 2>/dev/null | \
-perl -pe 's|<td[^>]*><a href="([^"]+)">([^<]+)</a></td>|<td>https://www.w3schools.com/cssref/$1</td><td>$2</td>|' | \
-html2csv | \
-sed -E "s,(\\\\),\1\1," | \
-awk 'BEGIN{FS="\t"} /www\.w3schools\.com/ {print $2"\t"$3"\t"$1}' \
-
+#!/bin/bash
+"$(dirname "$0")"/w3schools.sh /cssref/index.php
index 028132aae6238bc460213ea3cecffdc63fc533da..06d412df5c434fc35dc5fb1157d26e542188089b 100755 (executable)
@@ -1,8 +1,2 @@
-#!/bin/bash -l
-http_get https://www.w3schools.com/cssref/css_selectors.php \
-| xmllint --html --xpath "/html/body//div[@id=\"main\"]//table" - 2>/dev/null \
-| sed -E "s, href=\"([^\"]+)\",>https://www.w3schools.com/cssref/\1|</a><a,g" \
-| xml2csv - \
-| grep "|" \
-| sed "s/|/\t/" \
-| awk 'BEGIN{FS="\t"}{print $2"\t"$3"\t"$4"\t"$1}' \
+#!/bin/bash
+"$(dirname "$0")"/w3schools.sh /cssref/css_selectors.php
index 6bcaa62709f4b9776926f71ec506f455b2ce669c..9940e40a172eb7a3f4b43d27448f5b2e8dd723aa 100755 (executable)
@@ -1,19 +1,2 @@
-#!/bin/bash -l
-
-# NB 01.06.23: TODEL  
-false && http_get https://html.com/attributes/ \
-| sed -E "s,<(thrive_headline|header)[^.>]+>([^<]+)</\1>,<div>\2</div>," \
-| xmllint --html --xpath //table - 2>/dev/null \
-| sed -E "s,<a[^>]+href=\"((https://html.com)?/attributes/([^\"\/]+)\/?)[^<]+</a>,\3</td><td>https://html.com/\1,g" \
-| xml2csv - \
-| awk 'BEGIN{FS="\t"}{print $1"\t"$3"\t"$2}' \
-
-
-true && http_get https://www.w3schools.com/tags/ref_attributes.asp | \
-xmllint --html --xpath '/html/body//div[@id="main"]//table' - 2>/dev/null | \
-grep -v 'href="tag_' | \
-perl -pe 's|<td[^>]*><a href="([^"]+)">([^<]+)</a></td>|<td>https://www.w3schools.com/tags/$1</td><td>$2</td>|' | \
-html2csv | \
-sed -E "s,(\\\\),\1\1," | \
-awk 'BEGIN{FS="\t"} /www\.w3schools\.com/ {print $2"\t"$3"\t"$1}' \
-
+#!/bin/bash
+"$(dirname "$0")"/w3schools.sh /tags/ref_attributes.asp 1,3
index 7063c35885fb76073db25b2143ba7c933f99d801..50204ef664f76d3d797bc8a9dd9684df6df7cba4 100755 (executable)
@@ -1,5 +1,2 @@
-#!/bin/bash -l
-html2txt https://html.com/tags/ \
-| grep "^<" \
-| sed -E -e "s/ *HTML Tag *//" -e "s,^<([^>]+)>,\1\thttps://www.w3schools.com/TAGS/tag_\1.asp\t," \
-| awk 'BEGIN{FS="\t"}{print $1"\t"$3"\t"$2}' \
+#!/bin/bash
+"$(dirname "$0")"/w3schools.sh /TAGS/ | sed -E 's,<([a-z][^>]*)>,\1,'
diff --git a/lib/postgres/w3schools.sh b/lib/postgres/w3schools.sh
new file mode 100755 (executable)
index 0000000..6047c20
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/bash -l
+
+path=${1:?Usage: $0 /path/ for url https://www.w3schools.com}
+cut="$2"
+url="https://www.w3schools.com$path"
+preff="$(echo "$url"|sed -E 's,[^/]+\.[^/]+$,,')"
+#exec echo $preff
+
+http_get "$url" | \
+xmllint --format --html --xpath '/html/body//div[@id="main"]//table' - 2>/dev/null | \
+grep -Ev '<th' | \
+html2csv --href2txt | \
+sed -E \
+       -e "s,\[href=([^]]+),[href=$preff\1,g" \
+       -e "s,(\\\\),\1\1," \
+| \
+( test -z "$cut" && cat || cut -f "$cut" ) \
+| perl -pe '
+       s/\[href=([^\]]+)\](.*)$/\2\t\1/;
+       m,\thttp[^\t]+$, or s,$,\thttps://www.w3schools.com/tags/ref_attributes.asp,;
+' \
+;