Commit 81979ffb authored by Nacim Goura

create class for crawl

parent fb5996e6
import { Meteor } from 'meteor/meteor';
import Crawler from 'simplecrawler';
import _ from 'lodash';
import cheerio from 'cheerio';
import checkData from '../../utils/checkData';
export default class crawlWebsite {
constructor(url) {
this.urls = [];
this.crawler = new Crawler(url);
this.crawler.on('fetchcomplete', (queueItem) => {
const url = queueItem.url;
if (checkData.checkURL(url)) {
console.log(url);
this.urls.push(url);
}
});
this.crawler.on('complete', () => {
console.log(`${this.urls.length} parsé`);
});
this.crawler.maxDepth = 3;
this.crawler.start();
}
parseData($, url) {
}
}
......@@ -4,6 +4,7 @@ import Sitemapper from 'sitemapper';
import Crawler from 'crawler';
import checkData from '../../utils/checkData';
import IndexGeneric from './indexGeneric';
import CrawlWebsite from '../crawl/crawlWebsite';
const sitemap = new Sitemapper();
......@@ -15,9 +16,11 @@ export default class IndexWebsite extends IndexGeneric {
* @returns Promise
*/
async initIndexation(url) {
let urls = [url];
const crawl = new CrawlWebsite(url);
/* let urls = [url];
// si c'est un sitemap on récupère ces url
if (checkData.isSitemap(url)) {
if (checkData.isSitemap(url)) {
const { sites } = await sitemap.fetch(url);
urls = sites;
}
......@@ -25,7 +28,7 @@ export default class IndexWebsite extends IndexGeneric {
.then(dataToIndex => this.indexByBulk(dataToIndex))
.catch((error) => {
throw error;
});
});*/
}
/**
......@@ -105,7 +108,7 @@ export default class IndexWebsite extends IndexGeneric {
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType,
_id: decodeURI(url),
_id: url,
},
});
......
......@@ -8,13 +8,19 @@ Meteor.methods({
addTest(test) {
check(test, Object);
test.urlExpected = _.replace(test.urlExpected, 'https', 'http');
const results = Meteor.call('searchByTerm', test.term);
_.forEach(results.list, (result, index) => {
if (test.urlExpected === result.url) {
test.urlPosition = index + 1;
Meteor.call('searchByTerm', test.term, Meteor.bindEnvironment((err, results) => {
if (err) {
console.log(err);
throw new Meteor.Error('Error', 'Erreur lors de la recherche');
} else {
_.forEach(results.list, (result, index) => {
if (test.urlExpected === result.url) {
test.urlPosition = index + 1;
}
});
testSearchCollection.insert(test);
}
});
testSearchCollection.insert(test);
}));
},
deleteTest(id) {
check(id, String);
......
......@@ -16,9 +16,7 @@ Template.testSearchTpl.events({
};
Meteor.callPromise('addTest', test)
.then((results) => {
console.log(results);
}).catch((error) => {
.catch((error) => {
console.log(error);
});
},
......@@ -27,9 +25,7 @@ Template.testSearchTpl.events({
Template.testSearchActionTable.events({
'click .delete-test-search': function () {
Meteor.callPromise('deleteTest', this._id)
.then((results) => {
console.log(results);
}).catch((error) => {
.catch((error) => {
console.log(error);
});
},
......
......@@ -52,6 +52,11 @@ const checkData = {
allowProtocolRelative: true,
});
},
// url pour crawler
checkURL(url) {
return !url.match(/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf)/);
},
};
export default checkData;
......@@ -6,6 +6,10 @@
version "1.0.5"
resolved "https://registry.yarnpkg.com/@meteorjs/eslint-config-meteor/-/eslint-config-meteor-1.0.5.tgz#88c14dfbb4a2fa2258fff4889ccd8ddd0eccb04b"
"@types/node@^6.0.46":
version "6.0.78"
resolved "https://registry.yarnpkg.com/@types/node/-/node-6.0.78.tgz#5d4a3f579c1524e01ee21bf474e6fba09198f470"
Base64@~0.2.0:
version "0.2.1"
resolved "https://registry.yarnpkg.com/Base64/-/Base64-0.2.1.tgz#ba3a4230708e186705065e66babdd4c35cf60028"
......@@ -145,6 +149,12 @@ async@^1.4.0:
version "1.5.2"
resolved "https://registry.yarnpkg.com/async/-/async-1.5.2.tgz#ec6a61ae56480c0c3cb241c95618e20892f9672a"
async@^2.1.4:
version "2.4.1"
resolved "https://registry.yarnpkg.com/async/-/async-2.4.1.tgz#62a56b279c98a11d0987096a01cc3eeb8eb7bbd7"
dependencies:
lodash "^4.14.0"
asynckit@^0.4.0:
version "0.4.0"
resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79"
......@@ -544,6 +554,17 @@ cheerio@^0.22.0:
lodash.reject "^4.4.0"
lodash.some "^4.4.0"
cheerio@^1.0.0-rc.1:
version "1.0.0-rc.1"
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.1.tgz#2af37339eab713ef6b72cde98cefa672b87641fe"
dependencies:
css-select "~1.2.0"
dom-serializer "~0.1.0"
entities "~1.1.1"
htmlparser2 "^3.9.1"
lodash "^4.15.0"
parse5 "^3.0.1"
cipher-base@^1.0.0, cipher-base@^1.0.1, cipher-base@^1.0.3:
version "1.0.3"
resolved "https://registry.yarnpkg.com/cipher-base/-/cipher-base-1.0.3.tgz#eeabf194419ce900da3018c207d212f2a6df0a07"
......@@ -1523,7 +1544,7 @@ https-browserify@0.0.1:
version "0.0.1"
resolved "https://registry.yarnpkg.com/https-browserify/-/https-browserify-0.0.1.tgz#3f91365cabe60b77ed0ebba24b454e3e09d95a82"
iconv-lite@^0.4.8:
iconv-lite@^0.4.13, iconv-lite@^0.4.8:
version "0.4.17"
resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.17.tgz#4fdaa3b38acbc2c031b045d0edcdfe1ecab18c8d"
......@@ -1880,7 +1901,7 @@ lodash@^3.6.0:
version "3.10.1"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-3.10.1.tgz#5bf45e8e49ba4189e17d482789dfd15bd140b7b6"
lodash@^4.0.0, lodash@^4.13.1, lodash@^4.17.4, lodash@^4.2.0, lodash@^4.3.0, lodash@^4.5.1:
lodash@^4.0.0, lodash@^4.13.1, lodash@^4.14.0, lodash@^4.15.0, lodash@^4.17.4, lodash@^4.2.0, lodash@^4.3.0, lodash@^4.5.1:
version "4.17.4"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.4.tgz#78203a4d1c328ae1d86dca6460e369b57f4055ae"
......@@ -2158,6 +2179,12 @@ parse-json@^2.2.0:
dependencies:
error-ex "^1.2.0"
parse5@^3.0.1:
version "3.0.2"
resolved "https://registry.yarnpkg.com/parse5/-/parse5-3.0.2.tgz#05eff57f0ef4577fb144a79f8b9a967a6cc44510"
dependencies:
"@types/node" "^6.0.46"
parseurl@~1.3.1:
version "1.3.1"
resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.1.tgz#c8ab8c9223ba34888aa64a297b28853bec18da56"
......@@ -2472,6 +2499,10 @@ ripemd160@^2.0.0, ripemd160@^2.0.1:
hash-base "^2.0.0"
inherits "^2.0.1"
robots-parser@^1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/robots-parser/-/robots-parser-1.0.1.tgz#79e32e3e5cb4066de501404207a8eb1232970020"
rsvp@^3.0.14, rsvp@^3.0.17:
version "3.5.0"
resolved "https://registry.yarnpkg.com/rsvp/-/rsvp-3.5.0.tgz#a62c573a4ae4e1dfd0697ebc6242e79c681eaa34"
......@@ -2547,6 +2578,15 @@ simpl-schema@^0.3.0:
mongo-object "0.0.1"
underscore "1.8.3"
simplecrawler@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/simplecrawler/-/simplecrawler-1.1.3.tgz#e9cfd35f0aa1a9e0a91768c47e6c9b4da405862a"
dependencies:
async "^2.1.4"
iconv-lite "^0.4.13"
robots-parser "^1.0.0"
urijs "^1.16.1"
sitemapper@^2.1.13:
version "2.1.13"
resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-2.1.13.tgz#1d47b0c57db6c2fcf25721025377ab6c5a191ce2"
......@@ -2822,6 +2862,10 @@ unpipe@~1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/unpipe/-/unpipe-1.0.0.tgz#b2bf4ee8514aae6165b4817829d21b2ef49904ec"
urijs@^1.16.1:
version "1.18.10"
resolved "https://registry.yarnpkg.com/urijs/-/urijs-1.18.10.tgz#b94463eaba59a1a796036a467bb633c667f221ab"
url@^0.11.0:
version "0.11.0"
resolved "https://registry.yarnpkg.com/url/-/url-0.11.0.tgz#3838e97cfc60521eb73c525a8e55bfdd9e2e28f1"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment