Commit 9d8e0551 authored by Nacim Goura

Fix cron bug and add exclude element

parent fc0d0dbf
@@ -45,6 +45,11 @@ SimpleSchema.configCollection = new SimpleSchema({
label: 'Element du breadcrumb',
required: false,
},
+ 'listConfig.$.excludeElement': {
+ type: String,
+ label: 'Element à exclure',
+ required: false,
+ },
'listConfig.$.forbiddenWordString': {
type: String,
label: 'Mot non indexable (à séparer par une virgule)',
......
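For context, a listConfig entry carrying the new field might look like the sketch below; the field names come from this schema and from the crawler code, while the concrete values are invented for illustration.

// Hypothetical listConfig entry -- every value here is made up.
const exampleListConfigEntry = {
  domain: 'www.example.org',           // site this configuration applies to
  breadcrumb: '.breadcrumb',           // element whose text is indexed as the breadcrumb
  excludeElement: 'nav, footer, .ads', // selector removed from each page before indexing
  forbiddenWordString: 'login,logout', // comma-separated words that disqualify a URL from crawling
};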
@@ -21,7 +21,8 @@ export default class crawlWebsite extends CrawlGeneric {
this.name = data.name;
this.contextIndex = contextIndex;
this.urlWebsite = data.urlWebsite;
- this.config.crawl = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
+ this.userId = this.config.userId;
+ this.config = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
return this.start();
}
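Note the ordering in this hunk: this.userId is read from the top-level config before this.config is reassigned to the matched listConfig entry, otherwise the user id would no longer be reachable. A minimal standalone sketch of the same pattern (the function name is invented, and lodash is assumed for _.find; only listConfig, userId and domain come from the diff):

import _ from 'lodash';

// Sketch only: capture fields of the global config *before* narrowing it,
// since the reassignment replaces the object that held them.
function selectCrawlConfig(globalConfig, nameConfig) {
  const userId = globalConfig.userId; // must be read first
  const config = _.find(globalConfig.listConfig, n => n.domain === nameConfig);
  return { userId, config };
}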
@@ -63,15 +64,15 @@ export default class crawlWebsite extends CrawlGeneric {
if (urlHref) {
let toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
// check if same domain name
- toQueueUrl = toQueueUrl.replace('https', 'http');
- // console.log(toQueueUrl, toQueueUrl.includes(this.config.crawl.domain));
- if (toQueueUrl.includes(this.config.crawl.domain)) {
+ toQueueUrl = toQueueUrl.replace('https', 'http').replace(/\/$/, '');
+ // console.log(toQueueUrl, toQueueUrl.includes(this.config.domain));
+ if (toQueueUrl.includes(this.config.domain)) {
// clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word
- if (this.config.crawl.forbiddenWord.length === 0 || !new RegExp(this.config.crawl.forbiddenWord.join('|')).test(toQueueUrl)) {
+ if (this.config.forbiddenWord.length === 0 || !new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
// check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls
@@ -99,6 +100,8 @@ export default class crawlWebsite extends CrawlGeneric {
},
},
);
+ }).catch((err) => {
+ console.log(err);
});
}
}
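The added .replace(/\/$/, '') strips a trailing slash so that http://example.org/page and http://example.org/page/ are not queued and crawled twice; together with the existing https-to-http rewrite it keeps listUrlAlreadyVisited free of near-duplicate URLs. A standalone sketch of that normalization (the helper name is invented):

// Sketch: normalize a URL the way the crawler does before the already-visited check.
// Note that String.replace with a string pattern only rewrites the first occurrence.
function normalizeUrl(rawUrl) {
  return rawUrl.replace('https', 'http').replace(/\/$/, '');
}

// normalizeUrl('https://www.example.org/page/') === 'http://www.example.org/page'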
@@ -123,11 +126,14 @@ export default class crawlWebsite extends CrawlGeneric {
*/
async parseData($, currentUrl, listPdf) {
const body = $('body');
+ if (this.config.excludeElement) {
+ $(this.config.excludeElement).remove();
+ }
body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text());
const dataForIndex = {
tag: 'website',
- domain: this.config.crawl.domain,
+ domain: this.config.domain,
title,
title_suggest: {
input: title,
@@ -140,8 +146,8 @@ export default class crawlWebsite extends CrawlGeneric {
createdAt: new Date(),
};
- if ($(this.config.crawl.breadcrumb).text().length) {
- dataForIndex.breadcrumb = checkData.cleanText($(this.config.crawl.breadcrumb).text());
+ if ($(this.config.breadcrumb).text().length) {
+ dataForIndex.breadcrumb = checkData.cleanText($(this.config.breadcrumb).text());
}
if ($('h1').text().length) {
@@ -156,7 +162,7 @@ export default class crawlWebsite extends CrawlGeneric {
dataForIndex.listPdf = listPdf.join(' ');
}
- const _index = this.config.userId;
+ const _index = this.userId;
const _type = 'website';
const _id = currentUrl;
......
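excludeElement is treated as a selector passed straight to the page's $ before the body is cleaned and indexed, so navigation menus, footers and similar boilerplate can be kept out of the search index. A minimal sketch of that step, assuming Cheerio as the $ implementation and an illustrative selector:

import cheerio from 'cheerio';

// Sketch: strip excluded elements before extracting the text to index.
const html = '<body><nav>menu</nav><p>real content</p></body>';
const $ = cheerio.load(html);
const excludeElement = 'nav'; // would come from the listConfig excludeElement field
if (excludeElement) {
  $(excludeElement).remove(); // same call as in parseData above
}
console.log($('body').text().trim()); // -> 'real content'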
@@ -12,7 +12,11 @@ SyncedCron.add({
return parser.text('every 1 day');
},
job() {
- notifsCollection.remove({ createdAt: moment().subtract(1, 'days').toDate() });
+ notifsCollection.remove({
+ createdAt: {
+ $lt: moment().subtract(1, 'd').toDate(),
+ },
+ });
},
});
......
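This is the cron bug named in the commit title: the old call removed only notifications whose createdAt equalled one exact Date (which in practice never matches anything), while the new query uses $lt so every notification older than one day is deleted. The two selectors side by side, as a sketch:

import moment from 'moment';

// Old (buggy): matches only documents created at this exact millisecond.
const buggySelector = { createdAt: moment().subtract(1, 'days').toDate() };

// Fixed: matches every document created more than one day ago.
const fixedSelector = { createdAt: { $lt: moment().subtract(1, 'd').toDate() } };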
@@ -8,9 +8,6 @@ new Tabular.Table({
name: 'testSearch',
collection: testSearchCollection,
columns: [
- {
- data: 'userId', title: 'userId',
- },
{
data: 'searchTerm', title: 'Terme recherché',
},
@@ -38,6 +35,9 @@ new Tabular.Table({
tmpl: Meteor.isClient && Template.testSearchActionTable,
},
],
+ selector() {
+ return { userId: Meteor.userId() };
+ },
pageLength: 20,
language: {
sProcessing: 'Traitement en cours...',
......
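Instead of displaying a userId column, the table now restricts its rows to the current user: the value returned by selector() is applied to the table's query, so each logged-in user only sees their own testSearchCollection documents. Dropping the column is consistent with that, since every visible row now belongs to the viewer. In effect the data shown is roughly the sketch below (assuming Meteor's accounts API):

// Sketch: the practical effect of the selector() hook on the rows the table can show.
const rowsForCurrentUser = testSearchCollection.find({ userId: Meteor.userId() });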