Commit 9d8e0551 authored by Nacim Goura

fix bug cron and add exclude element

parent fc0d0dbf
...@@ -45,6 +45,11 @@ SimpleSchema.configCollection = new SimpleSchema({ ...@@ -45,6 +45,11 @@ SimpleSchema.configCollection = new SimpleSchema({
label: 'Element du breadcrumb', label: 'Element du breadcrumb',
required: false, required: false,
}, },
'listConfig.$.excludeElement': {
type: String,
label: 'Element à exclure',
required: false,
},
'listConfig.$.forbiddenWordString': { 'listConfig.$.forbiddenWordString': {
type: String, type: String,
label: 'Mot non indexable (à séparer par une virgule)', label: 'Mot non indexable (à séparer par une virgule)',
......
...@@ -21,7 +21,8 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -21,7 +21,8 @@ export default class crawlWebsite extends CrawlGeneric {
this.name = data.name; this.name = data.name;
this.contextIndex = contextIndex; this.contextIndex = contextIndex;
this.urlWebsite = data.urlWebsite; this.urlWebsite = data.urlWebsite;
this.config.crawl = _.find(this.config.listConfig, n => n.domain === data.nameConfig); this.userId = this.config.userId;
this.config = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
return this.start(); return this.start();
} }
...@@ -63,15 +64,15 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -63,15 +64,15 @@ export default class crawlWebsite extends CrawlGeneric {
if (urlHref) { if (urlHref) {
let toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href')); let toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
// check if same domain name // check if same domain name
toQueueUrl = toQueueUrl.replace('https', 'http'); toQueueUrl = toQueueUrl.replace('https', 'http').replace(/\/$/, '');
// console.log(toQueueUrl, toQueueUrl.includes(this.config.crawl.domain)); // console.log(toQueueUrl, toQueueUrl.includes(this.config.domain));
if (toQueueUrl.includes(this.config.crawl.domain)) { if (toQueueUrl.includes(this.config.domain)) {
// clean url // clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl); // toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited // check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) { if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word // check if url has forbidden word
if (this.config.crawl.forbiddenWord.length === 0 || !new RegExp(this.config.crawl.forbiddenWord.join('|')).test(toQueueUrl)) { if (this.config.forbiddenWord.length === 0 || !new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
// check if url is good for crawl // check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) { if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls // add url in already visited urls
...@@ -99,6 +100,8 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -99,6 +100,8 @@ export default class crawlWebsite extends CrawlGeneric {
}, },
}, },
); );
}).catch((err) => {
console.log(err);
}); });
} }
} }
...@@ -123,11 +126,14 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -123,11 +126,14 @@ export default class crawlWebsite extends CrawlGeneric {
*/ */
async parseData($, currentUrl, listPdf) { async parseData($, currentUrl, listPdf) {
const body = $('body'); const body = $('body');
if (this.config.excludeElement) {
$(this.config.excludeElement).remove();
}
body.html(checkData.cleanHtml(body.html())); body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text()); const title = checkData.cleanText($('title').text());
const dataForIndex = { const dataForIndex = {
tag: 'website', tag: 'website',
domain: this.config.crawl.domain, domain: this.config.domain,
title, title,
title_suggest: { title_suggest: {
input: title, input: title,
...@@ -140,8 +146,8 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -140,8 +146,8 @@ export default class crawlWebsite extends CrawlGeneric {
createdAt: new Date(), createdAt: new Date(),
}; };
if ($(this.config.crawl.breadcrumb).text().length) { if ($(this.config.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.cleanText($(this.config.crawl.breadcrumb).text()); dataForIndex.breadcrumb = checkData.cleanText($(this.config.breadcrumb).text());
} }
if ($('h1').text().length) { if ($('h1').text().length) {
...@@ -156,7 +162,7 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -156,7 +162,7 @@ export default class crawlWebsite extends CrawlGeneric {
dataForIndex.listPdf = listPdf.join(' '); dataForIndex.listPdf = listPdf.join(' ');
} }
const _index = this.config.userId; const _index = this.userId;
const _type = 'website'; const _type = 'website';
const _id = currentUrl; const _id = currentUrl;
......
...@@ -12,7 +12,11 @@ SyncedCron.add({ ...@@ -12,7 +12,11 @@ SyncedCron.add({
return parser.text('every 1 day'); return parser.text('every 1 day');
}, },
job() { job() {
notifsCollection.remove({ createdAt: moment().subtract(1, 'days').toDate() }); notifsCollection.remove({
createdAt: {
$lt: moment().subtract(1, 'd').toDate(),
},
});
}, },
}); });
......
...@@ -8,9 +8,6 @@ new Tabular.Table({ ...@@ -8,9 +8,6 @@ new Tabular.Table({
name: 'testSearch', name: 'testSearch',
collection: testSearchCollection, collection: testSearchCollection,
columns: [ columns: [
{
data: 'userId', title: 'userId',
},
{ {
data: 'searchTerm', title: 'Terme recherché', data: 'searchTerm', title: 'Terme recherché',
}, },
...@@ -38,6 +35,9 @@ new Tabular.Table({ ...@@ -38,6 +35,9 @@ new Tabular.Table({
tmpl: Meteor.isClient && Template.testSearchActionTable, tmpl: Meteor.isClient && Template.testSearchActionTable,
}, },
], ],
selector() {
return { userId: Meteor.userId() };
},
pageLength: 20, pageLength: 20,
language: { language: {
sProcessing: 'Traitement en cours...', sProcessing: 'Traitement en cours...',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment