Skip to content

Commit

Permalink
Merge pull request #3 from masterT/params-schema
Browse files Browse the repository at this point in the history
Params schema
  • Loading branch information
masterT authored Feb 3, 2017
2 parents 2a3157f + baf014c commit 2a9130f
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 41 deletions.
62 changes: 30 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,48 +96,22 @@ var options = {
var scraper = yoloScraper.createScraper(options);
```

#### Returned scraper function
#### `options.paramsSchema`

To use your scraper function, pass the params of your scraping request, and a callback function.

```js
scraper(params, function (error, data) {
if (error) {
// handle the `error`
} else {
// do something with `data`
}
});
```

When a request error occurred, the callback `error` argument will be an instance of _Error_ and the `data` will be _null_.

##### Case `options.validateList = false`

When an validation error occurred, the callback `error` argument will be an instance of _ValidationError_ and the `data` will be _null_.

Otherwise, the `error` will be _null_ and `data` will be the returned value of `options.extract`.


##### Case `options.validateList = true`

When an validation errors occurred, the callback `error` argument will be an instance of _ListValidationError_, otherwise it will be _null_.

If the value returned by `options.extract` is not an Array, `error` will be an instance of _Error_.

The `data` always be an _Array_ that only contains the **valid** item returned by `options.extract`.
The [JSON schema](https://spacetelescope.github.io/understanding-json-schema/) that defines the shape of the accepted arguments passed to `options.request`. When the params do not match this schema, an Error will be thrown.

Optional

#### `options.request = function(params)`

Function that takes the *same argument* passed to your scraper function. It returns the options to pass to the [request ](https://www.npmjs.com/package/request) module to make the request.
Function that takes the arguments passed to your scraper function and returns the options to pass to the [request](https://www.npmjs.com/package/request) module to make the network request.

**Required**


#### `options.extract = function(response, body, $)`

Function that takes [request](https://www.npmjs.com/package/request) response, the response body and a [cheerio](https://www.npmjs.com/package/cheerio) instance. It returns the extracted data you want.
Function that takes [request](https://www.npmjs.com/package/request) response, the response body (_String_) and a [cheerio](https://www.npmjs.com/package/cheerio) instance. It returns the extracted data you want.

**Required**

Expand All @@ -158,7 +132,7 @@ Optional, default: `{}`

#### `options.ajvOptions`

The option to pass to [ajv](https://www.npmjs.com/package/ajv) when it compiles the schema.
The option to pass to [ajv](https://www.npmjs.com/package/ajv) when it compiles the JSON schemas.

Optional, default: `{allErrors: true}` - It checks all rules, collecting all errors

Expand All @@ -170,6 +144,30 @@ Use this option to validate each item of the data extracted **individually**. Wh
Optional, default: `false`


#### scraper function

To use your scraper function, pass the params to send to `options.request`, and a callback function.

```js
scraper(params, function (error, data) {
if (error) {
// handle the `error`
} else {
// do something with `data`
}
});
```

##### callback(error, data)

- When a network request error occurred, the callback `error` argument will be an _Error_ and the `data` will be _null_.

- When `options.validateList = false` and a validation error occurred, `error` will be a _ValidationError_ and the `data` will be _null_. Otherwise, the `error` will be _null_ and `data` will be the returned value of `options.extract`.

- When `options.validateList = true` and validation errors occurred, `error` will be a _ListValidationError_, otherwise it will be _null_. If the value returned by `options.extract` is not an Array, `error` will be an instance of _Error_. The `data` will always be an _Array_ that only contains the **valid** items returned by `options.extract`. Note that `data` can still contain items even when `error` is a _ListValidationError_!



## dependencies

- [request](https://www.npmjs.com/package/request) - Simplified HTTP request client.
Expand Down
54 changes: 54 additions & 0 deletions examples/usingParamsSchema.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
var yoloScraper = require('../lib/index.js');


// Example scraper: validates its params with `paramsSchema` before building the request.
var scraper = yoloScraper.createScraper({

  // The params must be a non-empty string (it is used as the username below).
  paramsSchema: {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "string",
    "minLength": 1
  },

  // Builds the URL to request from the lower-cased username.
  request: function (username) {
    var lowerCased = username.toLowerCase();
    return 'https://www.npmjs.com/~' + lowerCased;
  },

  // Turns every `.collaborated-packages li` node into a {name, url, version} record.
  extract: function (response, body, $) {
    var packages = [];
    $('.collaborated-packages li').toArray().forEach(function (node) {
      var $item = $(node);
      var $link = $item.find('a');
      packages.push({
        name: $link.text(),
        url: $link.attr('href'),
        version: $item.find('strong').text()
      });
    });
    return packages;
  },

  // The extracted data must be an array of objects holding exactly
  // `name`, `url` and `version`; `version` must look like "v1.2.3".
  schema: {
    "$schema": "http://json-schema.org/draft-04/schema#",
    "type" : "array",
    "items": {
      "type": "object",
      "additionalProperties": false,
      "properties": {
        "name": { "type": "string" },
        "url": { "type": "string", "format": "uri" },
        "version": { "type": "string", "pattern": "^v\\d+\\.\\d+\\.\\d+$" }
      },
      "required": [ "name", "url", "version" ]
    }
  }

});

// Sample params: a non-empty string satisfies `paramsSchema`, while the
// empty string violates its `minLength: 1` rule.
var validParams = "masterT";
var invalidParams = "";

// Run the scraper and print either the error or the extracted data.
scraper(validParams, function (error, data) {
// Swap the call above for the commented one below to try the invalid params.
// scraper(invalidParams, function (error, data) {
if (error) {
console.log('error:', error);
} else {
console.log('data:', data);
}
});
48 changes: 40 additions & 8 deletions lib/createScraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,73 @@ var cheerio = require('cheerio'),
ListValidationError = require('./ListValidationError.js');


/**
 * Tells whether `value` is an object that is neither `null` nor an array.
 *
 * @param {*} value - The value to inspect.
 * @returns {boolean} `true` for non-null, non-array objects, `false` otherwise.
 */
function isObject(value) {
  // `typeof null` is also 'object', so null is ruled out explicitly.
  if (value === null || typeof value !== 'object') {
    return false;
  }
  return !Array.isArray(value);
}


/**
 * Tells whether `value` is callable according to `typeof`.
 *
 * @param {*} value - The value to inspect.
 * @returns {boolean} `true` when `typeof value` is 'function'.
 */
function isFunction(value) {
  var kind = typeof value;
  return kind === 'function';
}


/**
 * Tells whether `value` is an array, as decided by `Array.isArray`.
 *
 * @param {*} value - The value to inspect.
 * @returns {boolean} `true` for arrays, `false` otherwise.
 */
function isArray(value) {
  if (Array.isArray(value)) {
    return true;
  }
  return false;
}


/**
 * Tells whether `value` is a primitive boolean.
 *
 * @param {*} value - The value to inspect.
 * @returns {boolean} `true` only for the primitives `true` and `false`.
 */
function isBoolean(value) {
  // Strict equality never matches Boolean wrapper objects, same as the `typeof` test.
  return value === true || value === false;
}


module.exports = function (options) {

if (typeof options.request !== 'function') {
if (!isFunction(options.request)) {
throw new Error("Expect options.request to be a function");
}
if (typeof options.extract !== 'function') {
if (!isFunction(options.extract)) {
throw new Error("Expect options.extract to be a function");
}
if (typeof options.schema !== 'object') {
if (!isObject(options.schema)) {
throw new Error("Expect options.schema to be an object");
}
if (options.hasOwnProperty('validateList') && typeof options.validateList !== 'boolean') {
if (options.hasOwnProperty('validateList') && !isBoolean(options.validateList)) {
throw new Error("Expect options.validateList to be a boolean");
}
if (options.hasOwnProperty('paramsSchema') && !isObject(options.paramsSchema)) {
throw new Error("Expect options.paramsSchema to be an object");
}

var cheerioOptions = {};
if (typeof options.cheerioOptions === 'object') {
if (isObject(options.cheerioOptions)) {
cheerioOptions = options.cheerioOptions;
}

var ajvOptions = {allErrors: true};
if (typeof options.ajvOptions === 'object') {
if (isObject(options.ajvOptions)) {
ajvOptions = options.ajvOptions;
}

// compile the JSON schema
var ajv = new Ajv(ajvOptions);
var validateParamsSchema;
var validateSchema = ajv.compile(options.schema);
if (options.paramsSchema) {
validateParamsSchema = ajv.compile(options.paramsSchema);
}

return function (params, callback) {

if (typeof callback !== 'function') {
if (!isFunction(callback)) {
throw new Error("Expect callback to be a function");
}

if (validateParamsSchema && !validateParamsSchema(params)) {
var paramsError = ajv.errorsText(validateParamsSchema.errors, {dataVar: 'params'});
throw new Error(paramsError);
}

var requestOption = options.request(params);

request(requestOption, function (error, response, body) {
Expand All @@ -55,7 +87,7 @@ module.exports = function (options) {
if (options.validateList) {
var validationErrors = [];
var validItems = [];
if (!Array.isArray(extractedData)) {
if (!isArray(extractedData)) {
callbackError = new Error('Expect the extracted data to be an array when using options.validateList');
} else {
extractedData.forEach(function (item) {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "yolo-scraper",
"version": "0.1.0",
"version": "0.2.0",
"description": "A simple way to structure your web scraper.",
"main": "lib/index.js",
"keywords": [
Expand Down
60 changes: 60 additions & 0 deletions spec/yoloScraperSpec.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ describe("createScraper", function () {
}).toThrowError(Error, "Expect options.request to be a function");
});

// A `request` that is present but not callable must be rejected at creation time.
it("throws an error when property `request` is not a function", function () {
  var invalidOptions = {
    request: null,
    extract: function () {},
    schema: {}
  };
  var create = function () {
    createScraper(invalidOptions);
  };

  expect(create).toThrowError(Error, "Expect options.request to be a function");
});

it("throws an error without function property `extract`", function () {
expect(function () {
createScraper({
Expand All @@ -32,6 +42,16 @@ describe("createScraper", function () {
}).toThrowError(Error, "Expect options.extract to be a function");
});

// An `extract` that is present but not callable must be rejected at creation time.
it("throws an error when property `extract` is not a function", function () {
  var invalidOptions = {
    request: function () {},
    extract: null,
    schema: {}
  };
  var create = function () {
    createScraper(invalidOptions);
  };

  expect(create).toThrowError(Error, "Expect options.extract to be a function");
});

it("throws an error without function property `schema`", function () {
expect(function () {
createScraper({
Expand All @@ -41,6 +61,16 @@ describe("createScraper", function () {
}).toThrowError(Error, "Expect options.schema to be an object");
});

// `schema` must be an object; `null` is passed here because `typeof null`
// is 'object', so this case only fails when null is ruled out explicitly.
it("throws an error when property `schema` is not an object", function () {
  expect(function () {
    createScraper({
      request: function () {},
      extract: function () {},
      schema: null
    });
  }).toThrowError(Error, "Expect options.schema to be an object");
});

it("throws an error when property `validateList` is not a boolean", function () {
expect(function () {
createScraper({
Expand All @@ -52,6 +82,17 @@ describe("createScraper", function () {
}).toThrowError(Error, "Expect options.validateList to be a boolean");
});

// When `paramsSchema` is provided it must be an object; `null` is rejected.
it("throws an error when property `paramsSchema` is not an object", function () {
  var invalidOptions = {
    request: function () {},
    extract: function () {},
    schema: {},
    paramsSchema: null
  };
  var create = function () {
    createScraper(invalidOptions);
  };

  expect(create).toThrowError(Error, "Expect options.paramsSchema to be an object");
});

it("returns a function with properties `request`, `extract` and `schema`, and without `validateList`", function () {
var scraper = createScraper({
request: function () {},
Expand All @@ -70,6 +111,25 @@ describe("createScraper", function () {
.pend("Don't know how to: mock request module and expect it to receive options.cheerioOptions");


describe("when using paramsSchema", function () {

  // Params that violate `paramsSchema` make the scraper throw synchronously
  // with a message that mentions "params".
  it("validate the params", function () {
    var invalidParams = "";
    var options = scraperOptions();
    options.paramsSchema = {
      "type": "string",
      "minLength": 1
    };
    var scraper = createScraper(options);
    var callWithInvalidParams = function () {
      scraper(invalidParams, function (error, data) {});
    };

    expect(callWithInvalidParams).toThrowError(Error, /params/);
  });

});


describe("when validateList is false", function () {
var requestBody = fixture("list.html"),
params = 'numbers',
Expand Down

0 comments on commit 2a9130f

Please sign in to comment.