package robotstxt

import (
	"strings"
	"testing"
	"time"
)

const (
	robotsCaseMatching = `user-agent: a
Disallow: /
user-agent: b
Disallow: /*
user-agent: c
Disallow: /fish
user-agent: d
Disallow: /fish*
user-agent: e
Disallow: /fish/
user-agent: f
Disallow: fish/
user-agent: g
Disallow: /*.php
user-agent: h
Disallow: /*.php$
user-agent: i
Disallow: /fish*.php`

	robotsCasePrecedence = `user-agent: a
Disallow: /
Allow: /p
user-agent: b
Disallow: /folder
Allow: /folder/
user-agent: c
Disallow: /*.htm
Allow: /page
user-agent: d
Disallow: /
Allow: /$
user-agent: e
Disallow: /
Allow: /$`
)

func TestGroupOrder(t *testing.T) {
	const robotsCaseOrder = `user-agent: googlebot-news
Disallow: /
user-agent: *
Disallow: /
user-agent: googlebot
Disallow: /`

	// Every group disallows everything, so each agent must be denied whatever
	// path it requests; the path number mirrors the group (1st, 2nd or 3rd)
	// that the agent is expected to match.
	agents := []string{"Googlebot-News (Googlebot)", "Googlebot", "Googlebot-Image (Googlebot)", "Otherbot (web)", "Otherbot (News)"}
	paths := []string{"/1", "/3", "/3", "/2", "/2"}

	if r, e := FromString(robotsCaseOrder); e != nil {
		t.Fatal(e)
	} else {
		for i, a := range agents {
			ExpectDisallow(t, r, paths[i], a)
		}
	}
}

func TestSitemaps(t *testing.T) {
	const robotsCaseSitemaps = `sitemap: http://test.com/a
user-agent: a
disallow: /c
sitemap: http://test.com/b
user-agent: b
disallow: /d
user-agent: e
sitemap: http://test.com/c
user-agent: f
disallow: /g`

	if r, e := FromString(robotsCaseSitemaps); e != nil {
		t.Fatal(e)
	} else {
		if len(r.Sitemaps) != 3 {
			for i, s := range r.Sitemaps {
				t.Logf("Sitemap %d: %s", i, s)
			}
			t.Fatalf("Expected 3 sitemaps, got %d:\n%v", len(r.Sitemaps), r.Sitemaps)
		}
	}
}

func TestCrawlDelays(t *testing.T) {
	// "useragent" and "crawldelay" (without hyphens) exercise the parser's
	// tolerance for non-canonical directive spellings.
	const robotsCaseDelays = `useragent: a
# some comment : with colon
disallow: /c
user-agent: b
crawldelay: 3.5
disallow: /d
user-agent: e
sitemap: http://test.com/c
user-agent: f
disallow: /g
crawl-delay: 5`

	if r, e := FromString(robotsCaseDelays); e != nil {
		t.Fatal(e)
	} else {
		if len(r.Sitemaps) != 1 {
			t.Fatalf("Expected 1 sitemap, got %d", len(r.Sitemaps))
		}
		if g := r.groups["b"]; g.CrawlDelay != time.Duration(3.5*float64(time.Second)) {
			t.Fatalf(`Expected crawl delay of 3.5s for group "b", got %v`, g.CrawlDelay)
		}
		if g := r.groups["f"]; g.CrawlDelay != (5 * time.Second) {
			t.Fatalf(`Expected crawl delay of 5s for group "f", got %v`, g.CrawlDelay)
		}
	}
}

func TestWildcards(t *testing.T) {
	const robotsCaseWildcards = `user-agent: *
Disallow: /path*l$`

	if r, e := FromString(robotsCaseWildcards); e != nil {
		t.Fatal(e)
	} else {
		if s := r.groups["*"].rules[0].pattern.String(); s != "/path.*l$" {
			t.Fatalf("Expected pattern to be /path.*l$, got %s", s)
		}
	}
}

func TestURLMatching(t *testing.T) {
	var ok bool

	// Paths prefixed with "^" are expected to be allowed (i.e. not matched by
	// the group's Disallow rule); the marker is stripped before testing.
	cases := map[string][]string{
		"a": []string{
			"/",
			"/test",
			"",
			"/path/to/whatever",
		},
		"b": []string{
			"/",
			"/test",
			"",
			"/path/to/whatever",
		},
		"c": []string{
			"/fish",
			"/fish.html",
			"/fish/salmon.html",
			"/fishheads",
			"/fishheads/yummy.html",
			"/fish.php?id=anything",
			"^/Fish.asp",
			"^/catfish",
			"^/?id=fish",
		},
		"d": []string{
			"/fish",
			"/fish.html",
			"/fish/salmon.html",
			"/fishheads",
			"/fishheads/yummy.html",
			"/fish.php?id=anything",
			"^/Fish.asp",
			"^/catfish",
			"^/?id=fish",
		},
		"e": []string{
			"/fish/",
			"/fish/?id=anything",
			"/fish/salmon.htm",
			"^/fish",
			"^/fish.html",
			"^/Fish/Salmon.asp",
		},
		"f": []string{
			"/fish/",
			"/fish/?id=anything",
			"/fish/salmon.htm",
			"^/fish",
			"^/fish.html",
			"^/Fish/Salmon.asp",
		},
		"g": []string{
			"/filename.php",
			"/folder/filename.php",
			"/folder/filename.php?parameters",
			"/folder/any.php.file.html",
			"/filename.php/",
			"^/",
			"^/windows.PHP",
		},
		"h": []string{
			"/filename.php",
			"/folder/filename.php",
			"^/filename.php?parameters",
			"^/filename.php/",
			"^/filename.php5",
			"^/windows.PHP",
		},
		"i": []string{
			"/fish.php",
			"/fishheads/catfish.php?parameters",
			"^/Fish.PHP",
		},
	}
	if r, e := FromString(robotsCaseMatching); e != nil {
		t.Fatal(e)
	} else {
		for k, ar := range cases {
			for _, p := range ar {
				ok = strings.HasPrefix(p, "^")
				if ok {
					p = p[1:]
				}
				if allow := r.TestAgent(p, k); allow != ok {
					t.Errorf("Agent %s, path %s, expected %v, got %v", k, p, ok, allow)
				}
			}
		}
	}
}

func TestURLPrecedence(t *testing.T) {
	var ok bool

	// The convention is inverted relative to TestURLMatching: most paths are
	// expected to be allowed by the group's Allow rule, and a leading "^"
	// marks a path expected to be disallowed.
	cases := map[string][]string{
		"a": []string{
			"/page",
			"^/test",
		},
		"b": []string{
			"/folder/page",
			"^/folder1",
			"^/folder.htm",
		},
		"c": []string{
			"^/page.htm",
			"/page1.asp",
		},
		"d": []string{
			"/",
			"^/index",
		},
		"e": []string{
			"^/page.htm",
			"/",
		},
	}
	if r, e := FromString(robotsCasePrecedence); e != nil {
		t.Fatal(e)
	} else {
		for k, ar := range cases {
			for _, p := range ar {
				ok = !strings.HasPrefix(p, "^")
				if !ok {
					p = p[1:]
				}
				if allow := r.TestAgent(p, k); allow != ok {
					t.Errorf("Agent %s, path %s, expected %v, got %v", k, p, ok, allow)
				}
			}
		}
	}
}

// getIndexInSlice returns the index of g in ar, or -1 if g is not present.
func getIndexInSlice(ar []*Group, g *Group) int {
	for i, v := range ar {
		if v == g {
			return i
		}
	}
	return -1
}