Commit 986910a2 authored by Nacim Goura

fix bug crawl website

parent c2422588
......@@ -36,6 +36,7 @@
"class-methods-use-this": "off",
"no-param-reassign": "off",
"meteor/no-session": "off",
"no-console": "off"
"no-console": "off",
"no-plusplus": "off"
}
}
......@@ -23,6 +23,7 @@ export default class crawlWebsite extends CrawlGeneric {
this.urlWebsite = data.urlWebsite;
this.userId = this.config.userId;
this.config = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
this.numberIndexed = 0;
return this.start();
}
......@@ -54,9 +55,19 @@ export default class crawlWebsite extends CrawlGeneric {
userAgent: [
'Mozilla/5.0 (compatible; fr-crawler/1.1)',
'Googlebot/2.1 (+http://www.google.com/bot.html)',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36',
'Mozilla/5.0 (Linux; Android 4.2.2; Nexus 7 Build/JDQ39) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.49 Safari/537.31',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)',
'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1',
'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9b5) Gecko/2008041514 Firefox/3.0b5',
'Opera/9.80 (X11; Linux x86_64; U; Ubuntu; fr) Presto/2.10.289 Version/12.01',
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
],
callback: (error, res, done) => {
console.log(crawl.queueSize, res.options.uri);
if (crawl.queueSize % 1000 === 0) {
console.log(crawl.queueSize, res.options.uri);
}
if (error || res.statusCode !== 200) {
this.listDataError.push({
url: res.options.uri,
......@@ -95,13 +106,15 @@ export default class crawlWebsite extends CrawlGeneric {
const urlParse = res.options.uri.replace('https', 'http').replace(/\/$/, '');
// Parse and index the fetched page, then persist the running counter of
// indexed pages on the job document.
this.parseData(res.$, urlParse, listPdf, res.headers.date)
  .then(() => {
    // Only write progress when the counter is a multiple of 100, so the
    // database is not hit once per crawled page.
    if (this.numberIndexed % 100 === 0) {
      jobCollection.update(
        { name: this.name },
        {
          // The operator must be `$set` (with the `$`). Assuming
          // jobCollection is a Meteor/Mongo collection, a modifier without
          // any `$` operator is treated as a replacement document and would
          // overwrite the whole job record with `{ set: ... }`.
          $set: {
            numberIndexed: this.numberIndexed,
          },
        },
      );
    }
  })
  .catch((err) => {
    // Best-effort: a failure on one page is logged and the crawl continues.
    console.log(err);
  });
......@@ -174,18 +187,8 @@ export default class crawlWebsite extends CrawlGeneric {
this.listDataForIndex.push(dataForIndex);
return this.contextIndex.indexOne(_index, _type, _id, dataForIndex);
this.numberIndexed++;
/* this.listDataForIndex.push({
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: this.config.userId,
_id: currentUrl,
},
});
this.listDataForIndex.push(dataForIndex);
*/
return this.contextIndex.indexOne(_index, _type, _id, dataForIndex);
}
}
......@@ -19,7 +19,7 @@ Template.searchTpl.events({
const term = event.target.value;
if (term && term.length > 2) {
if (term && term.length > 2 && term.length < 20) {
Meteor.callPromise('autoCompletion', term)
.then((results) => {
Session.set('autoCompleteResults', results);
......
......@@ -70,7 +70,7 @@ const checkData = {
// url for crawl
checkCrawlUrl(url) {
return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx|mp3)/.test(url);
return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx|mp3|doc)/.test(url);
},
// clean url
......
#!/usr/bin/env bash
# Show the logs of the deployed application through pm2-meteor.
# Must be run from the project root, which contains the .deploy/ directory.

# Load nvm and the user's shell configuration so `nvm` and the PATH entries
# they define are available when the script runs non-interactively.
source ~/.nvm/nvm.sh
source ~/.profile
source ~/.bashrc

# Select the Node.js version for this project (presumably from .nvmrc — confirm).
nvm use

# Abort if the deploy directory is missing; otherwise pm2-meteor would run
# from the wrong working directory.
cd ./.deploy/ || exit 1

pm2-meteor logs 0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment