Commit 986910a2 authored by Nacim Goura

fix bug crawl website

parent c2422588
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
"class-methods-use-this": "off", "class-methods-use-this": "off",
"no-param-reassign": "off", "no-param-reassign": "off",
"meteor/no-session": "off", "meteor/no-session": "off",
"no-console": "off" "no-console": "off",
"no-plusplus": "off"
} }
} }
...@@ -23,6 +23,7 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -23,6 +23,7 @@ export default class crawlWebsite extends CrawlGeneric {
this.urlWebsite = data.urlWebsite; this.urlWebsite = data.urlWebsite;
this.userId = this.config.userId; this.userId = this.config.userId;
this.config = _.find(this.config.listConfig, n => n.domain === data.nameConfig); this.config = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
this.numberIndexed = 0;
return this.start(); return this.start();
} }
...@@ -54,9 +55,19 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -54,9 +55,19 @@ export default class crawlWebsite extends CrawlGeneric {
userAgent: [ userAgent: [
'Mozilla/5.0 (compatible; fr-crawler/1.1)', 'Mozilla/5.0 (compatible; fr-crawler/1.1)',
'Googlebot/2.1 (+http://www.google.com/bot.html)', 'Googlebot/2.1 (+http://www.google.com/bot.html)',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36',
'Mozilla/5.0 (Linux; Android 4.2.2; Nexus 7 Build/JDQ39) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.49 Safari/537.31',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1',
'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9b5) Gecko/2008041514 Firefox/3.0b5',
'Opera/9.80 (X11; Linux x86_64; U; Ubuntu; fr) Presto/2.10.289 Version/12.01',
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
], ],
callback: (error, res, done) => { callback: (error, res, done) => {
console.log(crawl.queueSize, res.options.uri); if (crawl.queueSize % 1000 === 0) {
console.log(crawl.queueSize, res.options.uri);
}
if (error || res.statusCode !== 200) { if (error || res.statusCode !== 200) {
this.listDataError.push({ this.listDataError.push({
url: res.options.uri, url: res.options.uri,
...@@ -95,13 +106,15 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -95,13 +106,15 @@ export default class crawlWebsite extends CrawlGeneric {
const urlParse = res.options.uri.replace('https', 'http').replace(/\/$/, ''); const urlParse = res.options.uri.replace('https', 'http').replace(/\/$/, '');
this.parseData(res.$, urlParse, listPdf, res.headers.date) this.parseData(res.$, urlParse, listPdf, res.headers.date)
.then(() => { .then(() => {
jobCollection.update({ name: this.name }, if (this.numberIndexed % 100 === 0) {
{ jobCollection.update({ name: this.name },
$inc: { {
numberIndexed: 1, set: {
numberIndexed: this.numberIndexed,
},
}, },
}, );
); }
}).catch((err) => { }).catch((err) => {
console.log(err); console.log(err);
}); });
...@@ -174,18 +187,8 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -174,18 +187,8 @@ export default class crawlWebsite extends CrawlGeneric {
this.listDataForIndex.push(dataForIndex); this.listDataForIndex.push(dataForIndex);
return this.contextIndex.indexOne(_index, _type, _id, dataForIndex); this.numberIndexed++;
return this.contextIndex.indexOne(_index, _type, _id, dataForIndex);
/* this.listDataForIndex.push({
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: this.config.userId,
_id: currentUrl,
},
});
this.listDataForIndex.push(dataForIndex);
*/
} }
} }
...@@ -19,7 +19,7 @@ Template.searchTpl.events({ ...@@ -19,7 +19,7 @@ Template.searchTpl.events({
const term = event.target.value; const term = event.target.value;
if (term && term.length > 2) { if (term && term.length > 2 && term.length < 20) {
Meteor.callPromise('autoCompletion', term) Meteor.callPromise('autoCompletion', term)
.then((results) => { .then((results) => {
Session.set('autoCompleteResults', results); Session.set('autoCompleteResults', results);
......
...@@ -70,7 +70,7 @@ const checkData = { ...@@ -70,7 +70,7 @@ const checkData = {
// url for crawl // url for crawl
checkCrawlUrl(url) { checkCrawlUrl(url) {
return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx|mp3)/.test(url); return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx|mp3|doc)/.test(url);
}, },
// clean url // clean url
......
#!/usr/bin/env bash
# Show logs for the deployed app through pm2-meteor.
# The relative `cd` below means this must be run from the directory that
# contains `.deploy/` (presumably the project root — confirm).

# Load nvm and the user's shell environment so that `nvm` (and, presumably,
# the pm2-meteor binary) are on PATH even in a non-interactive shell.
source ~/.nvm/nvm.sh
source ~/.profile
source ~/.bashrc

# With no version argument, nvm selects the Node version from .nvmrc.
nvm use

# Abort if the deploy directory is missing; otherwise pm2-meteor would run
# from the wrong working directory.
cd ./.deploy/ || {
  echo "error: cannot cd into ./.deploy/" >&2
  exit 1
}

# NOTE(review): the trailing `0` is passed straight through to pm2-meteor;
# its exact meaning is not visible from this script — confirm.
pm2-meteor logs 0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment