#!/usr/bin/perl
#
# Crawl a SourceForge tracker with wget:
#   1. fetch the paginated bug listing and collect every bug detail link,
#   2. fetch every bug detail page and collect every attachment link,
#   3. download every attachment.
# All fetched pages and files are written by wget into the current directory.

use strict;
use warnings;

# Listing URL of the tracker; the page offset is appended by bug_list_url().
my $base_url = 'http://sourceforge.net/tracker/index.php?func=browse&group_id=1897&atid=101897&set=custom&_assigned_to=0&_status=100&_category=100&_group=100&order=artifact_id&sort=ASC';

my @bugs;       # bug ids, in the order they were discovered
my %bugurl;     # bug id  -> site-relative URL of the bug detail page
my @files;      # attachment ids, in the order they were discovered
my %fileurl;    # file id -> site-relative URL of the attachment download

# $quoted_string = shell_quote ($raw_string)
#
# Backslash-escapes every non-word character so the result can be pasted
# into a shell command line as one word. The empty string becomes "".
# No longer needed by wget() (which bypasses the shell) but kept available
# for any other caller.
sub shell_quote {
    my ($raw) = @_;
    return '""' if $raw eq '';
    (my $quoted = $raw) =~ s/(\W)/\\$1/g;
    return $quoted;
}

# $filename = _local_filename ($url)
#
# Returns everything after the last '/' of $url. The script assumes this is
# the name wget gives the downloaded file in the current directory.
sub _local_filename {
    my ($url) = @_;
    (my $fn = $url) =~ s{^.*/([^/]*)}{$1};
    return $fn;
}

# $filename = wget ($url)
#
# Downloads $url with the external wget program and returns the expected
# local file name. A failed download is reported on STDERR but is not
# fatal, so one bad URL does not abort the whole crawl.
sub wget {
    my ($url) = @_;

    # List form of system(): the URL is handed to wget verbatim and never
    # passes through a shell, so URLs scraped from remote pages cannot
    # inject shell syntax.
    my $status = system 'wget', $url;
    warn "wget failed for $url (status $status)\n" if $status != 0;

    return _local_filename($url);
}

# $url = bug_list_url ($offset)
#
# Returns the URL of the bug listing page starting at row $offset.
sub bug_list_url {
    my ($offset) = @_;
    return $base_url . '&offset=' . $offset;
}

# $url = abs_url ($site_relative_url)
#
# Turns a site-relative link (as found in the fetched pages) into an
# absolute URL on the same host as $base_url.
sub abs_url {
    my ($bug_url) = @_;

    # Keep only "http://host/" of the base URL.
    (my $root = $base_url) =~ s{^(http://[^/]+/).*}{$1};

    # The root already ends in '/', so drop the link's own leading slash(es)
    # to avoid producing "http://host//path".
    (my $path = $bug_url) =~ s{^/+}{};

    return $root . $path;
}

# $count = get_bug_list ($offset)
#
# Fetches the listing page at $offset, records every bug detail link not
# seen before in @bugs / %bugurl and returns how many new bugs were found.
# Returns 0 when the page could not be read, which ends the pagination loop
# in crawl_bugs().
sub get_bug_list {
    my ($offset) = @_;

    my $fn = wget(bug_list_url($offset));
    print "$fn\n";

    my $list;
    if (!open $list, '<', $fn) {
        warn "cannot open $fn: $!\n";
        return 0;
    }

    my $count = 0;
    while (my $line = <$list>) {
        # Only the first detail link on each line is considered.
        next unless $line =~ m{A HREF="(/tracker/index.php\?func=detail&aid=(\d+)[^"]*)"};
        my ($url, $bugid) = ($1, $2);
        next if $bugurl{$bugid};

        push @bugs, $bugid;
        $bugurl{$bugid} = $url;
        $count++;
        print "bug $bugid $url\n";
    }
    close $list;

    return $count;
}

# crawl_bug ($bugid)
#
# Fetches the detail page of a bug previously recorded in %bugurl and
# records every attachment link not seen before in @files / %fileurl.
sub crawl_bug {
    my ($bug) = @_;

    my $fn = wget(abs_url($bugurl{$bug}));

    my $page;
    if (!open $page, '<', $fn) {
        warn "cannot open $fn: $!\n";
        return;
    }

    while (my $line = <$page>) {
        # Only the first download link on each line is considered.
        next unless $line =~ m{A HREF="(/tracker/download.php\?.*&file_id=(\d+)[^"]*)"};
        my ($url, $fileid) = ($1, $2);
        next if $fileurl{$fileid};

        push @files, $fileid;
        $fileurl{$fileid} = $url;
        print "file $fileid $url\n";
    }
    close $page;

    return;
}

# $filename = download_file ($fileid)
#
# Downloads an attachment previously recorded in %fileurl and returns the
# expected local file name.
sub download_file {
    my ($file) = @_;
    return wget(abs_url($fileurl{$file}));
}

# crawl_bugs ()
#
# Runs the whole crawl: walks the listing 50 rows at a time until a page
# yields no new bug, then visits every bug and downloads every attachment.
sub crawl_bugs {
    my $offset = 0;
    while (get_bug_list($offset)) {
        $offset += 50;
    }
    print join(", ", @bugs) . "\n";

    foreach my $bug (@bugs) {
        crawl_bug($bug);
    }
    print join(", ", @files) . "\n";

    foreach my $file (@files) {
        download_file($file);
    }

    return;
}

# Start crawling only when executed as a script, so the helpers above can
# be loaded with require/do without touching the network.
crawl_bugs() unless caller;

1;