youtubebeat/vendor/github.com/temoto/robotstxt/robotstxt_test.go

344 lines
14 KiB
Go
Raw Blame History

package robotstxt
import (
"testing"
)
func TestFromStatusAndStringBasic(t *testing.T) {
if _, err := FromStatusAndString(200, ""); err != nil {
t.Fatal("FromStatusAndString MUST accept 200/\"\"")
}
if _, err := FromStatusAndString(401, ""); err != nil {
t.Fatal("FromStatusAndString MUST accept 401/\"\"")
}
if _, err := FromStatusAndString(403, ""); err != nil {
t.Fatal("FromStatusAndString MUST accept 403/\"\"")
}
if _, err := FromStatusAndString(404, ""); err != nil {
t.Fatal("FromStatusAndString MUST accept 404/\"\"")
}
}
func ExpectAllow(t *testing.T, r *RobotsData, path, agent string) {
if !r.TestAgent(path, agent) {
t.Fatalf("Expected allow path '%s' agent '%s'", path, agent)
}
}
func ExpectDisallow(t *testing.T, r *RobotsData, path, agent string) {
if r.TestAgent(path, agent) {
t.Fatalf("Expected disallow path '%s' agent '%s'", path, agent)
}
}
func ExpectAllowAll(t *testing.T, r *RobotsData) {
if !r.TestAgent("/", "Somebot") {
t.Fatal("Expected allow all")
}
}
func ExpectDisallowAll(t *testing.T, r *RobotsData) {
if r.TestAgent("/", "Somebot") {
t.Fatal("Expected disallow all")
}
}
func TestStatus401(t *testing.T) {
r, _ := FromStatusAndString(401, "")
ExpectAllowAll(t, r)
}
func TestStatus403(t *testing.T) {
r, _ := FromStatusAndString(403, "")
ExpectAllowAll(t, r)
}
func TestStatus404(t *testing.T) {
r, _ := FromStatusAndString(404, "")
ExpectAllowAll(t, r)
}
func TestFromStringBasic(t *testing.T) {
if _, err := FromString(""); err != nil {
t.Fatal("FromString MUST accept \"\"")
}
}
func TestFromStringEmpty(t *testing.T) {
r, _ := FromString("")
if allow := r.TestAgent("/", "Somebot"); !allow {
t.Fatal("FromString(\"\") MUST allow everything.")
}
}
func TestFromStringComment(t *testing.T) {
if _, err := FromString("# comment"); err != nil {
t.Fatal("FromString MUST accept \"# comment\"")
}
}
func TestFromString001(t *testing.T) {
r, err := FromString("User-Agent: *\r\nDisallow: /\r\n")
if err != nil {
t.Fatal(err.Error())
}
allow := r.TestAgent("/foobar", "SomeAgent")
if allow {
t.Fatal("Must deny.")
}
}
func TestFromString002(t *testing.T) {
r, err := FromString("User-Agent: *\r\nDisallow: /account\r\n")
if err != nil {
t.Fatal(err.Error())
}
allow := r.TestAgent("/foobar", "SomeAgent")
if !allow {
t.Fatal("Must allow.")
}
}
const robotsText001 = "User-agent: * \nDisallow: /administrator/\nDisallow: /cache/\nDisallow: /components/\nDisallow: /editor/\nDisallow: /forum/\nDisallow: /help/\nDisallow: /images/\nDisallow: /includes/\nDisallow: /language/\nDisallow: /mambots/\nDisallow: /media/\nDisallow: /modules/\nDisallow: /templates/\nDisallow: /installation/\nDisallow: /getcid/\nDisallow: /tooltip/\nDisallow: /getuser/\nDisallow: /download/\nDisallow: /index.php?option=com_phorum*,quote=1\nDisallow: /index.php?option=com_phorum*phorum_query=search\nDisallow: /index.php?option=com_phorum*,newer\nDisallow: /index.php?option=com_phorum*,older\n\nUser-agent: Yandex\nAllow: /\nSitemap: http://www.pravorulya.com/sitemap.xml\nSitemap: http://www.pravorulya.com/sitemap1.xml"
func TestFromString003(t *testing.T) {
r, err := FromString(robotsText001)
if err != nil {
t.Fatal(err.Error())
}
allow := r.TestAgent("/administrator/", "SomeBot")
if allow {
t.Fatal("Must deny.")
}
}
func TestFromString004(t *testing.T) {
r, err := FromString(robotsText001)
if err != nil {
t.Fatal(err.Error())
}
allow := r.TestAgent("/paruram", "SomeBot")
if !allow {
t.Fatal("Must allow.")
}
}
func TestInvalidEncoding(t *testing.T) {
// Invalid UTF-8 encoding should not break parser.
_, err := FromString("User-agent: H\xef\xbf\xbdm<64>h<EFBFBD>kki\nDisallow: *")
if err != nil {
t.Fatal(err.Error())
}
}
// http://www.google.com/robots.txt on Wed, 12 Jan 2011 12:22:20 GMT
const robotsText002 = ("User-agent: *\nDisallow: /search\nDisallow: /groups\nDisallow: /images\nDisallow: /catalogs\nDisallow: /catalogues\nDisallow: /news\nAllow: /news/directory\nDisallow: /nwshp\nDisallow: /setnewsprefs?\nDisallow: /index.html?\nDisallow: /?\nDisallow: /addurl/image?\nDisallow: /pagead/\nDisallow: /relpage/\nDisallow: /relcontent\nDisallow: /imgres\nDisallow: /imglanding\nDisallow: /keyword/\nDisallow: /u/\nDisallow: /univ/\nDisallow: /cobrand\nDisallow: /custom\nDisallow: /advanced_group_search\nDisallow: /googlesite\nDisallow: /preferences\nDisallow: /setprefs\nDisallow: /swr\nDisallow: /url\nDisallow: /default\nDisallow: /m?\nDisallow: /m/?\nDisallow: /m/blogs?\nDisallow: /m/directions?\nDisallow: /m/ig\nDisallow: /m/images?\nDisallow: /m/local?\nDisallow: /m/movies?\nDisallow: /m/news?\nDisallow: /m/news/i?\nDisallow: /m/place?\nDisallow: /m/products?\nDisallow: /m/products/\nDisallow: /m/setnewsprefs?\nDisallow: /m/search?\nDisallow: /m/swmloptin?\nDisallow: /m/trends\nDisallow: /m/video?\nDisallow: /wml?\nDisallow: /wml/?\nDisallow: /wml/search?\nDisallow: /xhtml?\nDisallow: /xhtml/?\nDisallow: /xhtml/search?\nDisallow: /xml?\nDisallow: /imode?\nDisallow: /imode/?\nDisallow: /imode/search?\nDisallow: /jsky?\nDisallow: /jsky/?\nDisallow: /jsky/search?\nDisallow: /pda?\nDisallow: /pda/?\nDisallow: /pda/search?\nDisallow: /sprint_xhtml\nDisallow: /sprint_wml\nDisallow: /pqa\nDisallow: /palm\nDisallow: /gwt/\nDisallow: /purchases\nDisallow: /hws\nDisallow: /bsd?\nDisallow: /linux?\nDisallow: /mac?\nDisallow: /microsoft?\nDisallow: /unclesam?\nDisallow: /answers/search?q=\nDisallow: /local?\nDisallow: /local_url\nDisallow: /froogle?\nDisallow: /products?\nDisallow: /products/\nDisallow: /froogle_\nDisallow: /product_\nDisallow: /products_\nDisallow: /products;\nDisallow: /print\nDisallow: /books\nDisallow: /bkshp?q=\nAllow: /booksrightsholders\nDisallow: /patents?\nDisallow: /patents/\nAllow: /patents/about\nDisallow: /scholar\nDisallow: /complete\nDisallow: /sponsoredlinks\nDisallow: /videosearch?\nDisallow: /videopreview?\nDisallow: /videoprograminfo?\nDisallow: /maps?\nDisallow: /mapstt?\nDisallow: /mapslt?\nDisallow: /maps/stk/\nDisallow: /maps/br?\nDisallow: /mapabcpoi?\nDisallow: /maphp?\nDisallow: /places/\nAllow: /places/$\nDisallow: /maps/place\nDisallow: /help/maps/streetview/partners/welcome/\nDisallow: /lochp?\nDisallow: /center\nDisallow: /ie?\nDisallow: /sms/demo?\nDisallow: /katrina?\nDisallow: /blogsearch?\nDisallow: /blogsearch/\nDisallow: /blogsearch_feeds\nDisallow: /advanced_blog_search\nDisallow: /reader/\nAllow: /reader/play\nDisallow: /uds/\nDisallow: /chart?\nDisallow: /transit?\nDisallow: /mbd?\nDisallow: /extern_js/\nDisallow: /calendar/feeds/\nDisallow: /calendar/ical/\nDisallow: /cl2/feeds/\n" +
"Disallow: /cl2/ical/\nDisallow: /coop/directory\nDisallow: /coop/manage\nDisallow: /trends?\nDisallow: /trends/music?\nDisallow: /trends/hottrends?\nDisallow: /trends/viz?\nDisallow: /notebook/search?\nDisallow: /musica\nDisallow: /musicad\nDisallow: /musicas\nDisallow: /musicl\nDisallow: /musics\nDisallow: /musicsearch\nDisallow: /musicsp\nDisallow: /musiclp\nDisallow: /browsersync\nDisallow: /call\nDisallow: /archivesearch?\nDisallow: /archivesearch/url\nDisallow: /archivesearch/advanced_search\nDisallow: /base/reportbadoffer\nDisallow: /urchin_test/\nDisallow: /movies?\nDisallow: /codesearch?\nDisallow: /codesearch/feeds/search?\nDisallow: /wapsearch?\nDisallow: /safebrowsing\nAllow: /safebrowsing/diagnostic\nAllow: /safebrowsing/report_error/\nAllow: /safebrowsing/report_phish/\nDisallow: /reviews/search?\nDisallow: /orkut/albums\nAllow: /jsapi\nDisallow: /views?\nDisallow: /c/\nDisallow: /cbk\nDisallow: /recharge/dashboard/car\nDisallow: /recharge/dashboard/static/\nDisallow: /translate_a/\nDisallow: /translate_c\nDisallow: /translate_f\nDisallow: /translate_static/\nDisallow: /translate_suggestion\nDisallow: /profiles/me\nAllow: /profiles\nDisallow: /s2/profiles/me\nAllow: /s2/profiles\nAllow: /s2/photos\nAllow: /s2/static\nDisallow: /s2\nDisallow: /transconsole/portal/\nDisallow: /gcc/\nDisallow: /aclk\nDisallow: /cse?\nDisallow: /cse/home\nDisallow: /cse/panel\nDisallow: /cse/manage\nDisallow: /tbproxy/\nDisallow: /imesync/\nDisallow: /shenghuo/search?\nDisallow: /support/forum/search?\nDisallow: /reviews/polls/\nDisallow: /hosted/images/\nDisallow: /ppob/?\nDisallow: /ppob?\nDisallow: /ig/add?\nDisallow: /adwordsresellers\nDisallow: /accounts/o8\nAllow: /accounts/o8/id\nDisallow: /topicsearch?q=\nDisallow: /xfx7/\nDisallow: /squared/api\nDisallow: /squared/search\nDisallow: /squared/table\nDisallow: /toolkit/\nAllow: /toolkit/*.html\nDisallow: /globalmarketfinder/\nAllow: /globalmarketfinder/*.html\nDisallow: /qnasearch?\nDisallow: /errors/\nDisallow: /app/updates\nDisallow: /sidewiki/entry/\nDisallow: /quality_form?\nDisallow: /labs/popgadget/search\nDisallow: /buzz/post\nDisallow: /compressiontest/\nDisallow: /analytics/reporting/\nDisallow: /analytics/admin/\nDisallow: /analytics/web/\nDisallow: /analytics/feeds/\nDisallow: /analytics/settings/\nDisallow: /alerts/\nDisallow: /phone/compare/?\nAllow: /alerts/manage\nSitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml\nSitemap: http://www.google.com/hostednews/sitemap_index.xml\nSitemap: http://www.google.com/ventures/sitemap_ventures.xml\nSitemap: http://www.google.com/sitemaps_webmasters.xml\nSitemap: http://www.gstatic.com/trends/websites/sitemaps/sitemapindex.xml\nSitemap: http://www.gstatic.com/dictionary/static/sitemaps/sitemap_index.xml")
func TestFromString005(t *testing.T) {
r, err := FromString(robotsText002)
if err != nil {
t.Fatal(err.Error())
}
ExpectAllowAll(t, r)
}
func TestFromString006(t *testing.T) {
r, err := FromString(robotsText002)
if err != nil {
t.Fatal(err.Error())
}
allow := r.TestAgent("/search", "SomeBot")
if allow {
t.Fatal("Must deny.")
}
}
const robotsText003 = "User-Agent: * \nAllow: /"
func TestFromString007(t *testing.T) {
r, err := FromString(robotsText003)
if err != nil {
t.Fatal(err.Error())
}
allow := r.TestAgent("/random", "SomeBot")
if !allow {
t.Fatal("Must allow.")
}
}
const robotsText004 = "User-Agent: * \nDisallow: "
func TestFromString008(t *testing.T) {
r, err := FromString(robotsText004)
if err != nil {
t.Log(robotsText004)
t.Fatal(err.Error())
}
allow := r.TestAgent("/random", "SomeBot")
if !allow {
t.Fatal("Must allow.")
}
}
const robotsText005 = `User-agent: Google
Disallow:
User-agent: *
Disallow: /`
func TestRobotstxtOrgCase1(t *testing.T) {
if r, err := FromString(robotsText005); err != nil {
t.Fatal(err.Error())
} else if allow := r.TestAgent("/path/page1.html", "SomeBot"); allow {
t.Fatal("Must disallow.")
}
}
func TestRobotstxtOrgCase2(t *testing.T) {
if r, err := FromString(robotsText005); err != nil {
t.Fatal(err.Error())
} else if allow := r.TestAgent("/path/page1.html", "Googlebot"); !allow {
t.Fatal("Must allow.")
}
}
const robotsText006 = `
Host: site.ru`
func TestRobotstxtHostCase1(t *testing.T) {
if r, err := FromString(robotsText006); err != nil {
t.Fatal(err.Error())
} else if r.Host != "site.ru" {
t.Fatal("Incorrect host detection")
}
}
const robotsText007 = `
#Host: site.ru`
func TestRobotstxtHostCase2(t *testing.T) {
if r, err := FromString(robotsText007); err != nil {
t.Fatal(err.Error())
} else if r.Host != "" {
t.Fatal("Incorrect host detection")
}
}
const robotsText008 = `
Host: яндекс.рф`
func TestRobotstxtHostCase3(t *testing.T) {
if r, err := FromString(robotsText008); err != nil {
t.Fatal(err.Error())
} else if r.Host != "яндекс.рф" {
t.Fatal("Incorrect host detection")
}
}
const robotsTextErrs = `Disallow: /
User-agent: Google
Crawl-delay: fail`
func TestParseErrors(t *testing.T) {
if _, err := FromString(robotsTextErrs); err == nil {
t.Fatal("Expected error.")
} else {
if pe, ok := err.(*ParseError); !ok {
t.Fatal("Expected ParseError.")
} else if len(pe.Errs) != 2 {
t.Fatalf("Expected 2 errors, got %d:\n%v", len(pe.Errs), pe.Errs)
}
}
}
const robotsTextJustHTML = `<!DOCTYPE html>
<html>
<title></title>
<p>Hello world! This is valid HTML but invalid robots.txt.`
func TestHtmlInstead(t *testing.T) {
r, err := FromString(robotsTextJustHTML)
if err != nil {
// According to Google spec, invalid robots.txt file
// must be parsed silently.
t.Fatal(err.Error())
}
group := r.FindGroup("SuperBot")
if group == nil {
t.Fatal("Group must not be nil.")
}
if !group.Test("/") {
t.Fatal("Must allow by default.")
}
}
// http://perche.vanityfair.it/robots.txt on Sat, 13 Sep 2014 23:00:29 GMT
const robotsTextVanityfair = "\xef\xbb\xbfUser-agent: *\nDisallow: */oroscopo-di-oggi/*"
func TestWildcardPrefix(t *testing.T) {
r, err := FromString(robotsTextVanityfair)
if err != nil {
t.Fatal(err.Error())
}
if !r.TestAgent("/foo/bar", "WildcardBot") {
t.Fatal("Must allow /foo/bar")
}
if r.TestAgent("/oroscopo-di-oggi/bar", "WildcardBot") {
t.Fatal("Must not allow /oroscopo-di-oggi/bar")
}
if r.TestAgent("/foo/oroscopo-di-oggi/bar", "WildcardBot") {
t.Fatal("Must not allow /foo/oroscopo-di-oggi/bar")
}
}
func TestGrouping(t *testing.T) {
const robotsCaseGrouping = `user-agent: a
user-agent: b
disallow: /a
disallow: /b
user-agent: ignore
Disallow: /separator
user-agent: b
user-agent: c
disallow: /b
disallow: /c`
if r, e := FromString(robotsCaseGrouping); e != nil {
t.Fatal(e)
} else {
ExpectDisallow(t, r, "/a", "a")
ExpectDisallow(t, r, "/b", "a")
ExpectAllow(t, r, "/c", "a")
ExpectDisallow(t, r, "/a", "b")
ExpectDisallow(t, r, "/b", "b")
ExpectDisallow(t, r, "/c", "b")
ExpectAllow(t, r, "/a", "c")
ExpectDisallow(t, r, "/b", "c")
ExpectDisallow(t, r, "/c", "c")
}
}
func BenchmarkParseFromString001(b *testing.B) {
for i := 0; i < b.N; i++ {
FromString(robotsText001)
b.SetBytes(int64(len(robotsText001)))
}
}
func BenchmarkParseFromString002(b *testing.B) {
for i := 0; i < b.N; i++ {
FromString(robotsText002)
b.SetBytes(int64(len(robotsText002)))
}
}
func BenchmarkParseFromStatus401(b *testing.B) {
for i := 0; i < b.N; i++ {
FromStatusAndString(401, "")
}
}