Difference between revisions of "Software/Spider"

From ThorstensHome
Jump to: navigation, search
m (1 revision(s))
 
(16 intermediate revisions by one user not shown)
Line 1: Line 1:
A program that follows all links in a given html file.
+
{{DISPLAYTITLE:Spider}}
 +
Spider is [[software]] that shows all links in a given html file.
  
= Perl program =
+
= Download =
The following code lists all links in an html file.
+
→ [http://www.staerk.de/files/spider4.tar Download version for KDE 4]
<pre>
+
#!/usr/bin/perl
+
require HTML::LinkExtor;
+
$p = HTML::LinkExtor->new(\&parse, "");
+
sub parse
+
{
+
  my($tag, %links) = @_;
+
  my ($att, $url) = @{[%links]};
+
  print "$url\n";
+
}
+
if ($ARGV[0])
+
{
+
  $p->parse_file($ARGV[0]);
+
}
+
else
+
{
+
  print "Usage: spider.pl htmlfile.htm\n";
+
}
+
</pre>
+
  
= KDE programs =
+
= Installation =
&rarr; [http://www.staerk.de/files/spider4.tar Download version for KDE 4]<br />
+
Here I describe the installation for a default SUSE 11.1 distribution:
&rarr; [http://www.staerk.de/files/spider3.tar Download version for KDE 3]
+
tar xvf spider4.tar
 +
cd spider
 +
yast -i libtidy-devel
 +
cmake . && make -j4 && make install
  
== spider for KDE 4 ==
+
= Usage =
 +
itchy:~/repos/spider # ./spider example.htm 2>/dev/null
 +
http://www.linuxintro.org
  
Problem:
+
= Example =
<body>
+
is recognized correctly, but not
+
<body lang=DE link=blue vlink=purple bgcolor=#eeeeff>
+
This is because XML requires attribute values to be quoted, like
+
<body lang="DE" link="blue" vlink="purple" bgcolor="#eeeeff">
+
 
+
Solution:
+
 
+
Use [http://tidy.sourceforge.net/tidy tidy] to make sure your html file is an xhtml file.
+
Or use
+
QXmlQuery query;
+
query.setQuery("index.html", "/html/body/h1");
+
QStringList headings;
+
query.evaluateTo(&headings);
+
 
+
= Spider.pl =
+
Spider.pl follows all links in an html file:
+
 
<pre>
 
<pre>
#!/usr/bin/perl
+
# wget www.staerk.de/thorsten 2>/dev/null; spider Main_Page 2>/dev/null
# spider.pl (c) 2008 by Thorsten Staerk
+
#column-one
# This program extracts links in a web page and follows them.
+
#searchInput
require HTML::LinkExtor;
+
/thorsten/index.php/C_Programming_Tutorial
$p = HTML::LinkExtor->new(\&parse, "");
+
http://www.linuxintro.org/wiki/umts
 +
http://www.linuxintro.org/wiki/bluetooth
 +
/thorsten/index.php/My_Tutorials
 +
/thorsten/index.php/Html2mediawiki
 +
/thorsten/index.php/Dospath
 +
/thorsten/index.php/Software
 +
/thorsten/index.php/What_I_always_wanted_to_say
 +
/thorsten/index.php/About_this_site
 +
/thorsten/index.php/Image:Walschaerts_motion.gif
 +
http://en.wikipedia.org/wiki/Image:Walschaerts_motion.gif
 +
http://www.staerk.de/thorsten/index.php/Main_Page
 +
/thorsten/index.php/Main_Page
 +
/thorsten/index.php?title=Talk:Main_Page&action=edit
 +
/thorsten/index.php?title=Main_Page&action=edit
 +
/thorsten/index.php?title=Main_Page&action=history
 +
/thorsten/index.php?title=Special:Userlogin&returnto=Main_Page
 +
/thorsten/index.php/Main_Page
 +
/thorsten/index.php/Special:Recentchanges
 +
/thorsten/index.php/Special:Random
 +
/thorsten/index.php/Help:Contents
 +
/thorsten/index.php/Special:Whatlinkshere/Main_Page
 +
/thorsten/index.php/Special:Recentchangeslinked/Main_Page
 +
/thorsten/index.php/Special:Upload
 +
/thorsten/index.php/Special:Specialpages
 +
/thorsten/index.php?title=Main_Page&printable=yes
 +
/thorsten/index.php?title=Main_Page&oldid=1534
 +
http://www.mediawiki.org/
 +
http://www.gnu.org/copyleft/fdl.html
 +
http://www.gnu.org/copyleft/fdl.html
 +
/thorsten/index.php/ThorstensHome:Privacy_policy
 +
/thorsten/index.php/ThorstensHome:About
 +
/thorsten/index.php/ThorstensHome:General_disclaimer
 +
</pre>
  
sub parse
+
= See also =
{
+
* [[add_toc]]
    my($tag, %links) = @_;
+
* [[html2mediawiki]]
    my ($att, $url) = @{[%links]};
+
    #print "$url\n";
+
    get($url);
+
}
+
 
+
sub parse_if_ok()
+
{
+
  $filename=$_[0];
+
  $level=$_[1];
+
  $p->parse_file($filename);
+
}
+
 
+
#int main
+
get("http://www.heise.de", 0);
+
 
+
sub get( $url )
+
{
+
  print "Entering get $_[0]\n";
+
  $url=$_[0];
+
  $level=$_[1];
+
  # return if no http:// url
+
    $newurl=$url;
+
    $newurl =~ s\http://\\;
+
    if ($newurl eq $url) {print "returning\n"; return};
+
  use LWP::UserAgent;
+
  $agent = LWP::UserAgent->new;
+
  $answer = HTTP::Request->new(GET => $url);
+
  $answer->header('Accept' => 'text/html');
+
  $res = $agent->request($answer);
+
  if ($res->is_success)
+
  {
+
    print "successfully got response";
+
    $number="1";
+
    $oldurl=url;
+
    $url =~ s\http://\\;
+
    if ($oldurl eq $url) {return};
+
    print "still here";
+
    $filename=$url;
+
    $filename=~ s\/\-\g;
+
    open(FILE, ">"."$filename");
+
    print (FILE $res->content);
+
    print "printing file $filename";
+
    $p->parse_file("file$url");
+
 
+
  }
+
  else
+
  {
+
    print "Error: " . $res->status_line . "\n";
+
  }
+
}
+
</pre>
+

Latest revision as of 10:32, 10 August 2013

Spider is software that shows all links in a given html file.

Contents

Download

Download version for KDE 4

Installation

Here I describe the installation for a default SUSE 11.1 distribution:

tar xvf spider4.tar
cd spider
yast -i libtidy-devel
cmake . && make -j4 && make install

Usage

itchy:~/repos/spider # ./spider example.htm 2>/dev/null
http://www.linuxintro.org

Example

# wget www.staerk.de/thorsten 2>/dev/null; spider Main_Page 2>/dev/null
#column-one
#searchInput
/thorsten/index.php/C_Programming_Tutorial
http://www.linuxintro.org/wiki/umts
http://www.linuxintro.org/wiki/bluetooth
/thorsten/index.php/My_Tutorials
/thorsten/index.php/Html2mediawiki
/thorsten/index.php/Dospath
/thorsten/index.php/Software
/thorsten/index.php/What_I_always_wanted_to_say
/thorsten/index.php/About_this_site
/thorsten/index.php/Image:Walschaerts_motion.gif
http://en.wikipedia.org/wiki/Image:Walschaerts_motion.gif
http://www.staerk.de/thorsten/index.php/Main_Page
/thorsten/index.php/Main_Page
/thorsten/index.php?title=Talk:Main_Page&action=edit
/thorsten/index.php?title=Main_Page&action=edit
/thorsten/index.php?title=Main_Page&action=history
/thorsten/index.php?title=Special:Userlogin&returnto=Main_Page
/thorsten/index.php/Main_Page
/thorsten/index.php/Special:Recentchanges
/thorsten/index.php/Special:Random
/thorsten/index.php/Help:Contents
/thorsten/index.php/Special:Whatlinkshere/Main_Page
/thorsten/index.php/Special:Recentchangeslinked/Main_Page
/thorsten/index.php/Special:Upload
/thorsten/index.php/Special:Specialpages
/thorsten/index.php?title=Main_Page&printable=yes
/thorsten/index.php?title=Main_Page&oldid=1534
http://www.mediawiki.org/
http://www.gnu.org/copyleft/fdl.html
http://www.gnu.org/copyleft/fdl.html
/thorsten/index.php/ThorstensHome:Privacy_policy
/thorsten/index.php/ThorstensHome:About
/thorsten/index.php/ThorstensHome:General_disclaimer

See also