
Commit

add explicit disallow feature
SimonC-Audigent committed Sep 17, 2024
1 parent 21c66bf commit d5f8b28
Showing 4 changed files with 89 additions and 9 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -6,7 +6,7 @@ The parser currently supports:

- User-agent:
- Allow:
- Disallow:
- Disallow (with explicit mode support):
- Sitemap:
- Crawl-delay:
- Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferredHost(); // example.com
@@ -54,11 +55,12 @@ Returns true if crawling the specified URL is allowed for the specified user-agent.

This will return `undefined` if the URL isn't valid for this robots.txt.

### isDisallowed(url, [ua])
### isDisallowed(url, [ua], [explicit])

**boolean or undefined**

Returns true if crawling the specified URL is not allowed for the specified user-agent.
In explicit mode, wildcard (`*`) user-agent rules are ignored: only rules that explicitly name the given user agent are taken into account.

This will return `undefined` if the URL isn't valid for this robots.txt.
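
For example, with a robots.txt that only contains wildcard rules, a URL can be disallowed in standard mode yet not explicitly disallowed (`SomeBot` here is a made-up user agent):

var robots = robotsParser('http://www.example.com/robots.txt', [
'User-agent: *',
'Disallow: /'
].join('\n'));

robots.isDisallowed('http://www.example.com/test.html', 'SomeBot'); // true
robots.isDisallowed('http://www.example.com/test.html', 'SomeBot', true); // false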

52 changes: 46 additions & 6 deletions Robots.js
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};

Robots.prototype._getRule = function (url, ua) {
Robots.prototype._getRule = function (url, ua, explicit) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua || '*');

@@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
return;
}

var rules = this._rules[userAgent] || this._rules['*'] || [];
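// In explicit mode, do not fall back to the wildcard (*) rules.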
var rules = this._rules[userAgent];
if (!explicit) {
rules = rules || this._rules['*'];
}
rules = rules || [];

var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);

@@ -422,16 +427,51 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
};

/**
* Returns the opposite of isAllowed()
*
* In standard mode, it returns the opposite of isAllowed().
* In explicit mode, it will return:
* - true if the user agent is explicitly disallowed (wildcard rules are not considered),
* - false otherwise,
* and it will throw an error if the user agent is not specified.
* @param {string} url
* @param {string} ua
* @param {boolean} [explicit]
* @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua) {
return !this.isAllowed(url, ua);
Robots.prototype.isDisallowed = function (url, ua, explicit) {
if ((explicit === true) && (ua === undefined)) {
throw new Error('User Agent must be specified in explicit mode');
}

var rule = this._getRule(url, ua, explicit);
if (typeof rule === 'undefined') {
return true;
}
return !(!rule || rule.allow);
};

/**
* Returns true if crawling the specified URL is disallowed by a rule that
* explicitly names the given user agent; wildcard (*) rules are ignored.
*
* Returns undefined if the URL isn't valid for this robots.txt.
* @param {string} url
* @param {string} ua
* @return {boolean|undefined}
*/
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua);

// The base URL must match otherwise this robots.txt is not valid for it.
if (
parsedUrl.protocol !== this._url.protocol ||
parsedUrl.hostname !== this._url.hostname ||
parsedUrl.port !== this._url.port
) {
return;
}

var rules = this._rules[userAgent] || [];
var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);

if (typeof rule === 'undefined') {
return;
}

return !(!rule || rule.allow);
};

/**
* Gets the crawl delay if there is one.
*
2 changes: 1 addition & 1 deletion index.d.ts
@@ -2,7 +2,7 @@ declare module 'robots-parser';

interface Robot {
isAllowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
getMatchingLineNumber(url: string, ua?: string): number;
getCrawlDelay(ua?: string): number | undefined;
getSitemaps(): string[];
38 changes: 38 additions & 0 deletions test/Robots.js
@@ -861,4 +861,42 @@ describe('Robots', function () {

testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
});

it('should not be disallowed when wildcard is used in explicit mode', function () {
var contents = [
'User-agent: *',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isDisallowed(url, userAgent, true)).to.equal(false);
});

it('should be disallowed when the user agent matches a rule in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isDisallowed(url, userAgent, true)).to.equal(true);
});

it('should throw an error when user agent is not set in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var robots = robotsParser(url, contents);

expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw('User Agent must be specified in explicit mode');
});
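
// A minimal companion sketch for isExplicitlyDisallowed, assuming the same
// robotsParser/expect setup as the tests above:
it('should be explicitly disallowed when a rule names the user agent', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, 'SomeBot')).to.equal(true);
});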
});
