Commit 537809df authored by Nacim Goura

finish test for search and optimize crawl and search

parent 230545f8
......@@ -20,6 +20,7 @@ Meteor.methods({
$set: {
domain: config.domain || oldConfig.domain,
forbiddenWord: config.forbiddenWord || oldConfig.forbiddenWord,
breadcrumb: config.breadcrumb || oldConfig.breadcrumb,
},
});
} else {
......
......@@ -4,6 +4,7 @@ exports.analyser = {
settings: {
analysis: {
filter: {
// suppression de ces mots pour diminuer le bruit
french_elision: {
type: 'elision',
articles_case: true,
......@@ -23,6 +24,7 @@ exports.analyser = {
'puisqu',
],
},
// synonyme
french_synonym: {
type: 'synonym',
ignore_case: true,
......@@ -32,12 +34,14 @@ exports.analyser = {
'pmi, protection maternelle et infantile',
],
},
// radical des mots
french_stemmer: {
type: 'stemmer',
language: 'light_french',
},
},
analyzer: {
// français elevé
french_heavy: {
tokenizer: 'icu_tokenizer',
filter: [
......@@ -46,18 +50,30 @@ exports.analyser = {
'french_synonym',
'french_stemmer',
'lowercase',
'asciifolding',
],
},
// français léger
french_light: {
tokenizer: 'icu_tokenizer',
char_filter: [
'html_strip',
],
filter: [
'french_elision',
'icu_folding',
'lowercase',
'asciifolding',
],
},
// analyzer for url
url_analyzer: {
tokenizer: 'uax_url_email',
filter: [
'french_elision',
'icu_folding',
'lowercase',
],
},
},
},
......@@ -107,7 +123,7 @@ exports.mapping = {
},
html: {
type: 'text',
analyzer: 'standard',
analyzer: 'french_light',
},
url: {
type: 'text',
......@@ -135,6 +151,10 @@ exports.mapping = {
type: 'text',
analyzer: 'french_light',
},
listPdf: {
type: 'text',
analyzer: 'url_analyzer',
},
createdAt: {
type: 'date',
},
......
......@@ -21,9 +21,15 @@ export default class IndexWebsite extends IndexGeneric {
domain: url.parse(urlWebsite).hostname,
forbiddenWord: [],
};
this.config = await Meteor.call('getConfig', {});
try {
const syncFunc = Meteor.wrapAsync(Meteor.call);
this.config = syncFunc('getConfig');
} catch (err) {
console.error(err);
}
console.log(this.config);
let urls = [urlWebsite];
// si c'est un sitemap on récupère ces url
if (checkData.isSitemap(urlWebsite)) {
......@@ -51,7 +57,6 @@ export default class IndexWebsite extends IndexGeneric {
this.listDataError = [];
this.listDataForIndex = [];
this.listUrlAlreadyVisited = urls;
this.listPdf = [];
console.log(`${urls.length} à parser!`);
......@@ -64,24 +69,40 @@ export default class IndexWebsite extends IndexGeneric {
error: error || res.statusCode,
});
} else if (res && res.$) {
const listPdf = [];
// get inside links
res.$('a').each((index, a) => {
const urlHref = res.$(a).attr('href');
if (urlHref) {
const toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
if (toQueueUrl.includes(this.config.domain) && !this.listUrlAlreadyVisited.includes(toQueueUrl)) {
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
this.listUrlAlreadyVisited.push(toQueueUrl);
console.log(toQueueUrl);
crawl.queue(toQueueUrl);
// check if same domain name
if (toQueueUrl.includes(this.config.domain)) {
// clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
// check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls
this.listUrlAlreadyVisited.push(toQueueUrl);
// check if url is pdf
if (toQueueUrl.match(/(pdf)/)) {
listPdf.push(toQueueUrl);
} else {
console.log(toQueueUrl);
crawl.queue(toQueueUrl);
}
}
}
}
}
}
});
this.parseData(res.$, res.options.uri);
} else {
if (!this.listUrlAlreadyVisited.includes(res.options.uri)) {
this.listUrlAlreadyVisited.push(res.options.uri);
this.listPdf.push(res.options.uri);
// if url has not parameter => parse data
if (!res.options.uri.match(/(#.*|\?.*)/g)) {
this.parseData(res.$, res.options.uri, listPdf);
}
}
......@@ -92,7 +113,6 @@ export default class IndexWebsite extends IndexGeneric {
crawl.queue(urls);
crawl.on('drain', () => {
console.error(this.listDataError);
console.log(this.listPdf);
resolve(this.listDataForIndex);
});
});
......@@ -101,9 +121,10 @@ export default class IndexWebsite extends IndexGeneric {
/**
* map data
* @param $
* @param url
* @param currentUrl
*/
parseData($, currentUrl) {
parseData($, currentUrl, listPdf) {
const _this = this;
const body = $('body');
body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text());
......@@ -121,16 +142,20 @@ export default class IndexWebsite extends IndexGeneric {
createdAt: new Date(),
};
if ($('.breadcrumb')) {
dataForIndex.breadcrumb = checkData.cleanText($('.breadcrumb').text());
if ($(_this.config.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(_this.config.breadcrumb).text()));
}
if ($('h1').text().length) {
dataForIndex.h1 = checkData.slugText(checkData.cleanText($('h1').text()));
}
if ($('h1').length) {
dataForIndex.h1 = checkData.cleanText($('h1').text());
if ($('h2').text().length) {
dataForIndex.h2 = checkData.slugText(checkData.cleanText($('h2').text()));
}
if ($('h2').length) {
dataForIndex.h2 = checkData.cleanText($('h2').text());
if (listPdf && listPdf.length) {
dataForIndex.listPdf = listPdf.join(' ');
}
this.listDataForIndex.push({
......
import SimpleSchema from 'simpl-schema';
import { Meteor } from 'meteor/meteor';
import { check } from 'meteor/check';
import _ from 'lodash';
import testSearchCollection from '../../collections/testSearchCollection';
Meteor.methods({
addTest(test) {
// test data
new SimpleSchema({
urlExpected: {
type: String,
regEx: SimpleSchema.RegEx.Url,
},
term: String,
}).validate(test);
check(test, Object);
testSearchCollection.simpleSchema().namedContext().validate(test);
// replace https by http
test.urlExpected = _.replace(test.urlExpected, 'https', 'http');
test.urlExpected = decodeURI(_.replace(test.urlExpected, 'https', 'http'));
// call search
Meteor.call('searchByTerm', test.term, Meteor.bindEnvironment((err, results) => {
......@@ -34,6 +30,7 @@ Meteor.methods({
testSearchCollection.update(oldTest[0]._id, {
$set: {
urlPosition: test.urlPosition,
createdAt: new Date(),
},
});
} else {
......@@ -46,4 +43,14 @@ Meteor.methods({
check(id, String);
testSearchCollection.remove({ _id: id });
},
launchTest() {
const tests = testSearchCollection.find({}).fetch();
_.each(tests, (test) => {
Meteor.call('addTest', test, Meteor.bindEnvironment((err) => {
if (err) {
throw new Meteor.Error('Error', 'Erreur lors de la relance des tests');
}
}));
});
},
});
......@@ -16,6 +16,9 @@ SimpleSchema.configCollection = new SimpleSchema({
type: String,
regEx: SimpleSchema.RegEx.Url,
},
breadcrumb: {
type: String,
},
forbiddenWord: {
type: Array,
},
......
......@@ -9,6 +9,10 @@ import { Mongo } from 'meteor/mongo';
const testSearchCollection = new Mongo.Collection('testSearch');
SimpleSchema.testSearchCollection = new SimpleSchema({
_id: {
type: String,
required: false,
},
term: {
type: String,
},
......
......@@ -23,7 +23,10 @@ Template.testSearchTpl.events({
'click .launch-test': (event) => {
event.preventDefault();
Meteor.callPromise('launchTest', {})
.catch((error) => {
console.log(error);
});
},
});
......
......@@ -26,12 +26,16 @@
<div class="row">
<form id="formReglageSite" class="form-horizontal" method="post">
<div class="form-group">
<label>Nom de domaine du site</label>
<input type="text" class="form-control hostUrlWebsite" name="hostUrlWebsite" required="true">
<label for="hostUrlWebsite">Nom de domaine du site</label>
<input id="hostUrlWebsite" type="text" class="form-control" name="hostUrlWebsite" required="true">
</div>
<div class="form-group">
<label>Mot dans l'url à ne pas indexer (séparé par une virgule)</label>
<input type="text" class="form-control forbiddenWordWebsite" name="forbiddenWordWebsite" data-role="tagsinput" required="true">
<label for="forbiddenWordWebsite">Mot dans l'url à ne pas indexer (séparé par une virgule)</label>
<input id="forbiddenWordWebsite" type="text" class="form-control" name="forbiddenWordWebsite" data-role="tagsinput" required="true">
</div>
<!-- Breadcrumb selector field; the label's "for" must equal the input's id so the two are associated. -->
<div class="form-group">
  <label for="breadcrumbWebsite">Element du breadcrumb</label>
  <input id="breadcrumbWebsite" type="text" class="form-control" name="breadcrumbWebsite" required="true">
</div>
<input type="submit" class="btn btn-success" value="Valider">
</form>
......
......@@ -8,13 +8,15 @@ import displayNotif from '../../../../components/notifs/notifs.js';
import './site.html';
Template.siteIndexationTpl.onRendered(() => {
$('.forbiddenWordWebsite').tagsinput();
$('#forbiddenWordWebsite').tagsinput();
$('.bootstrap-tagsinput').addClass('form-control');
Meteor.callPromise('getConfig', {})
.then((config) => {
$('.hostUrlWebsite').val(config.domain);
$('#hostUrlWebsite').val(config.domain);
$('#breadcrumbWebsite').val(config.breadcrumb);
if (config.forbiddenWord && config.forbiddenWord.length) {
config.forbiddenWord = config.forbiddenWord.join(',');
$('.forbiddenWordWebsite').tagsinput('add', config.forbiddenWord);
$('#forbiddenWordWebsite').tagsinput('add', config.forbiddenWord);
}
$('.bootstrap-tagsinput').addClass('form-control');
});
......@@ -62,6 +64,7 @@ Template.siteIndexationTpl.events({
const config = {
domain: event.target.hostUrlWebsite.value,
forbiddenWord: event.target.forbiddenWordWebsite.value,
breadcrumb: event.target.breadcrumbWebsite.value,
};
Meteor.callPromise('defineConfig', config)
......@@ -81,6 +84,6 @@ Template.siteIndexationTpl.events({
});
});
event.target.reset();
$('.forbiddenWordWebsite').tagsinput('removeAll');
$('#forbiddenWordWebsite').tagsinput('removeAll');
},
});
......@@ -46,6 +46,9 @@ const checkData = {
'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div',
'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre'],
allowedAttributes: [],
allowedClasses: {
div: ['breadcrumb'],
},
selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'],
allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {},
......@@ -54,9 +57,14 @@ const checkData = {
},
// url pour crawler
checkURL(url) {
checkCrawlUrl(url) {
return !url.match(/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf)/);
},
// clean url (remove text after # and ?)
cleanUrl(url) {
return url.replace(/(#.*)/g, '').replace(/\/{2,}/g, '/');
},
};
export default checkData;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment