Commit 537809df authored by Nacim Goura

finish test for search and optimize crawl and search

parent 230545f8
...@@ -20,6 +20,7 @@ Meteor.methods({ ...@@ -20,6 +20,7 @@ Meteor.methods({
$set: { $set: {
domain: config.domain || oldConfig.domain, domain: config.domain || oldConfig.domain,
forbiddenWord: config.forbiddenWord || oldConfig.forbiddenWord, forbiddenWord: config.forbiddenWord || oldConfig.forbiddenWord,
breadcrumb: config.breadcrumb || oldConfig.breadcrumb,
}, },
}); });
} else { } else {
......
...@@ -4,6 +4,7 @@ exports.analyser = { ...@@ -4,6 +4,7 @@ exports.analyser = {
settings: { settings: {
analysis: { analysis: {
filter: { filter: {
// suppression de ces mots pour diminuer le bruit
french_elision: { french_elision: {
type: 'elision', type: 'elision',
articles_case: true, articles_case: true,
...@@ -23,6 +24,7 @@ exports.analyser = { ...@@ -23,6 +24,7 @@ exports.analyser = {
'puisqu', 'puisqu',
], ],
}, },
// synonyme
french_synonym: { french_synonym: {
type: 'synonym', type: 'synonym',
ignore_case: true, ignore_case: true,
...@@ -32,12 +34,14 @@ exports.analyser = { ...@@ -32,12 +34,14 @@ exports.analyser = {
'pmi, protection maternelle et infantile', 'pmi, protection maternelle et infantile',
], ],
}, },
// radical des mots
french_stemmer: { french_stemmer: {
type: 'stemmer', type: 'stemmer',
language: 'light_french', language: 'light_french',
}, },
}, },
analyzer: { analyzer: {
// français elevé
french_heavy: { french_heavy: {
tokenizer: 'icu_tokenizer', tokenizer: 'icu_tokenizer',
filter: [ filter: [
...@@ -46,18 +50,30 @@ exports.analyser = { ...@@ -46,18 +50,30 @@ exports.analyser = {
'french_synonym', 'french_synonym',
'french_stemmer', 'french_stemmer',
'lowercase', 'lowercase',
'asciifolding',
], ],
}, },
// français léger
french_light: { french_light: {
tokenizer: 'icu_tokenizer', tokenizer: 'icu_tokenizer',
char_filter: [
'html_strip',
],
filter: [ filter: [
'french_elision', 'french_elision',
'icu_folding', 'icu_folding',
'lowercase', 'lowercase',
'asciifolding',
], ],
}, },
// analyzer for url
url_analyzer: { url_analyzer: {
tokenizer: 'uax_url_email', tokenizer: 'uax_url_email',
filter: [
'french_elision',
'icu_folding',
'lowercase',
],
}, },
}, },
}, },
...@@ -107,7 +123,7 @@ exports.mapping = { ...@@ -107,7 +123,7 @@ exports.mapping = {
}, },
html: { html: {
type: 'text', type: 'text',
analyzer: 'standard', analyzer: 'french_light',
}, },
url: { url: {
type: 'text', type: 'text',
...@@ -135,6 +151,10 @@ exports.mapping = { ...@@ -135,6 +151,10 @@ exports.mapping = {
type: 'text', type: 'text',
analyzer: 'french_light', analyzer: 'french_light',
}, },
listPdf: {
type: 'text',
analyzer: 'url_analyzer',
},
createdAt: { createdAt: {
type: 'date', type: 'date',
}, },
......
...@@ -21,9 +21,15 @@ export default class IndexWebsite extends IndexGeneric { ...@@ -21,9 +21,15 @@ export default class IndexWebsite extends IndexGeneric {
domain: url.parse(urlWebsite).hostname, domain: url.parse(urlWebsite).hostname,
forbiddenWord: [], forbiddenWord: [],
}; };
this.config = await Meteor.call('getConfig', {}); try {
const syncFunc = Meteor.wrapAsync(Meteor.call);
this.config = syncFunc('getConfig');
} catch (err) {
console.error(err);
}
console.log(this.config); console.log(this.config);
let urls = [urlWebsite]; let urls = [urlWebsite];
// si c'est un sitemap on récupère ces url // si c'est un sitemap on récupère ces url
if (checkData.isSitemap(urlWebsite)) { if (checkData.isSitemap(urlWebsite)) {
...@@ -51,7 +57,6 @@ export default class IndexWebsite extends IndexGeneric { ...@@ -51,7 +57,6 @@ export default class IndexWebsite extends IndexGeneric {
this.listDataError = []; this.listDataError = [];
this.listDataForIndex = []; this.listDataForIndex = [];
this.listUrlAlreadyVisited = urls; this.listUrlAlreadyVisited = urls;
this.listPdf = [];
console.log(`${urls.length} à parser!`); console.log(`${urls.length} à parser!`);
...@@ -64,24 +69,40 @@ export default class IndexWebsite extends IndexGeneric { ...@@ -64,24 +69,40 @@ export default class IndexWebsite extends IndexGeneric {
error: error || res.statusCode, error: error || res.statusCode,
}); });
} else if (res && res.$) { } else if (res && res.$) {
const listPdf = [];
// get inside links
res.$('a').each((index, a) => { res.$('a').each((index, a) => {
const urlHref = res.$(a).attr('href'); const urlHref = res.$(a).attr('href');
if (urlHref) { if (urlHref) {
const toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href')); const toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
if (toQueueUrl.includes(this.config.domain) && !this.listUrlAlreadyVisited.includes(toQueueUrl)) { // check if same domain name
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) { if (toQueueUrl.includes(this.config.domain)) {
this.listUrlAlreadyVisited.push(toQueueUrl); // clean url
console.log(toQueueUrl); // toQueueUrl = checkData.cleanUrl(toQueueUrl);
crawl.queue(toQueueUrl); // check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
// check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls
this.listUrlAlreadyVisited.push(toQueueUrl);
// check if url is pdf
if (toQueueUrl.match(/(pdf)/)) {
listPdf.push(toQueueUrl);
} else {
console.log(toQueueUrl);
crawl.queue(toQueueUrl);
}
}
}
} }
} }
} }
}); });
this.parseData(res.$, res.options.uri); // if url has not parameter => parse data
} else { if (!res.options.uri.match(/(#.*|\?.*)/g)) {
if (!this.listUrlAlreadyVisited.includes(res.options.uri)) { this.parseData(res.$, res.options.uri, listPdf);
this.listUrlAlreadyVisited.push(res.options.uri);
this.listPdf.push(res.options.uri);
} }
} }
...@@ -92,7 +113,6 @@ export default class IndexWebsite extends IndexGeneric { ...@@ -92,7 +113,6 @@ export default class IndexWebsite extends IndexGeneric {
crawl.queue(urls); crawl.queue(urls);
crawl.on('drain', () => { crawl.on('drain', () => {
console.error(this.listDataError); console.error(this.listDataError);
console.log(this.listPdf);
resolve(this.listDataForIndex); resolve(this.listDataForIndex);
}); });
}); });
...@@ -101,9 +121,10 @@ export default class IndexWebsite extends IndexGeneric { ...@@ -101,9 +121,10 @@ export default class IndexWebsite extends IndexGeneric {
/** /**
* map data * map data
* @param $ * @param $
* @param url * @param currentUrl
*/ */
parseData($, currentUrl) { parseData($, currentUrl, listPdf) {
const _this = this;
const body = $('body'); const body = $('body');
body.html(checkData.cleanHtml(body.html())); body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text()); const title = checkData.cleanText($('title').text());
...@@ -121,16 +142,20 @@ export default class IndexWebsite extends IndexGeneric { ...@@ -121,16 +142,20 @@ export default class IndexWebsite extends IndexGeneric {
createdAt: new Date(), createdAt: new Date(),
}; };
if ($('.breadcrumb')) { if ($(_this.config.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.cleanText($('.breadcrumb').text()); dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(_this.config.breadcrumb).text()));
}
if ($('h1').text().length) {
dataForIndex.h1 = checkData.slugText(checkData.cleanText($('h1').text()));
} }
if ($('h1').length) { if ($('h2').text().length) {
dataForIndex.h1 = checkData.cleanText($('h1').text()); dataForIndex.h2 = checkData.slugText(checkData.cleanText($('h2').text()));
} }
if ($('h2').length) { if (listPdf && listPdf.length) {
dataForIndex.h2 = checkData.cleanText($('h2').text()); dataForIndex.listPdf = listPdf.join(' ');
} }
this.listDataForIndex.push({ this.listDataForIndex.push({
......
import SimpleSchema from 'simpl-schema'; import SimpleSchema from 'simpl-schema';
import { Meteor } from 'meteor/meteor'; import { Meteor } from 'meteor/meteor';
import { check } from 'meteor/check';
import _ from 'lodash'; import _ from 'lodash';
import testSearchCollection from '../../collections/testSearchCollection'; import testSearchCollection from '../../collections/testSearchCollection';
Meteor.methods({ Meteor.methods({
addTest(test) { addTest(test) {
// test data // test data
new SimpleSchema({ check(test, Object);
urlExpected: { testSearchCollection.simpleSchema().namedContext().validate(test);
type: String,
regEx: SimpleSchema.RegEx.Url,
},
term: String,
}).validate(test);
// replace https by http // replace https by http
test.urlExpected = _.replace(test.urlExpected, 'https', 'http'); test.urlExpected = decodeURI(_.replace(test.urlExpected, 'https', 'http'));
// call search // call search
Meteor.call('searchByTerm', test.term, Meteor.bindEnvironment((err, results) => { Meteor.call('searchByTerm', test.term, Meteor.bindEnvironment((err, results) => {
...@@ -34,6 +30,7 @@ Meteor.methods({ ...@@ -34,6 +30,7 @@ Meteor.methods({
testSearchCollection.update(oldTest[0]._id, { testSearchCollection.update(oldTest[0]._id, {
$set: { $set: {
urlPosition: test.urlPosition, urlPosition: test.urlPosition,
createdAt: new Date(),
}, },
}); });
} else { } else {
...@@ -46,4 +43,14 @@ Meteor.methods({ ...@@ -46,4 +43,14 @@ Meteor.methods({
check(id, String); check(id, String);
testSearchCollection.remove({ _id: id }); testSearchCollection.remove({ _id: id });
}, },
launchTest() {
const tests = testSearchCollection.find({}).fetch();
_.each(tests, (test) => {
Meteor.call('addTest', test, Meteor.bindEnvironment((err) => {
if (err) {
throw new Meteor.Error('Error', 'Erreur lors de la relance des tests');
}
}));
});
},
}); });
...@@ -16,6 +16,9 @@ SimpleSchema.configCollection = new SimpleSchema({ ...@@ -16,6 +16,9 @@ SimpleSchema.configCollection = new SimpleSchema({
type: String, type: String,
regEx: SimpleSchema.RegEx.Url, regEx: SimpleSchema.RegEx.Url,
}, },
breadcrumb: {
type: String,
},
forbiddenWord: { forbiddenWord: {
type: Array, type: Array,
}, },
......
...@@ -9,6 +9,10 @@ import { Mongo } from 'meteor/mongo'; ...@@ -9,6 +9,10 @@ import { Mongo } from 'meteor/mongo';
const testSearchCollection = new Mongo.Collection('testSearch'); const testSearchCollection = new Mongo.Collection('testSearch');
SimpleSchema.testSearchCollection = new SimpleSchema({ SimpleSchema.testSearchCollection = new SimpleSchema({
_id: {
type: String,
required: false,
},
term: { term: {
type: String, type: String,
}, },
......
...@@ -23,7 +23,10 @@ Template.testSearchTpl.events({ ...@@ -23,7 +23,10 @@ Template.testSearchTpl.events({
'click .launch-test': (event) => { 'click .launch-test': (event) => {
event.preventDefault(); event.preventDefault();
Meteor.callPromise('launchTest', {})
.catch((error) => {
console.log(error);
});
}, },
}); });
......
...@@ -26,12 +26,16 @@ ...@@ -26,12 +26,16 @@
<div class="row"> <div class="row">
<form id="formReglageSite" class="form-horizontal" method="post"> <form id="formReglageSite" class="form-horizontal" method="post">
<div class="form-group"> <div class="form-group">
<label>Nom de domaine du site</label> <label for="hostUrlWebsite">Nom de domaine du site</label>
<input type="text" class="form-control hostUrlWebsite" name="hostUrlWebsite" required="true"> <input id="hostUrlWebsite" type="text" class="form-control" name="hostUrlWebsite" required="true">
</div> </div>
<div class="form-group"> <div class="form-group">
<label>Mot dans l'url à ne pas indexer (séparé par une virgule)</label> <label for="forbiddenWordWebsite">Mot dans l'url à ne pas indexer (séparé par une virgule)</label>
<input type="text" class="form-control forbiddenWordWebsite" name="forbiddenWordWebsite" data-role="tagsinput" required="true"> <input id="forbiddenWordWebsite" type="text" class="form-control" name="forbiddenWordWebsite" data-role="tagsinput" required="true">
</div>
<div class="form-group">
<!-- The label's "for" must equal the input's id so the label is bound to the field. -->
<label for="breadcrumbWebsite">Element du breadcrumb</label>
<input id="breadcrumbWebsite" type="text" class="form-control" name="breadcrumbWebsite" required="true">
</div> </div>
<input type="submit" class="btn btn-success" value="Valider"> <input type="submit" class="btn btn-success" value="Valider">
</form> </form>
......
...@@ -8,13 +8,15 @@ import displayNotif from '../../../../components/notifs/notifs.js'; ...@@ -8,13 +8,15 @@ import displayNotif from '../../../../components/notifs/notifs.js';
import './site.html'; import './site.html';
Template.siteIndexationTpl.onRendered(() => { Template.siteIndexationTpl.onRendered(() => {
$('.forbiddenWordWebsite').tagsinput(); $('#forbiddenWordWebsite').tagsinput();
$('.bootstrap-tagsinput').addClass('form-control');
Meteor.callPromise('getConfig', {}) Meteor.callPromise('getConfig', {})
.then((config) => { .then((config) => {
$('.hostUrlWebsite').val(config.domain); $('#hostUrlWebsite').val(config.domain);
$('#breadcrumbWebsite').val(config.breadcrumb);
if (config.forbiddenWord && config.forbiddenWord.length) { if (config.forbiddenWord && config.forbiddenWord.length) {
config.forbiddenWord = config.forbiddenWord.join(','); config.forbiddenWord = config.forbiddenWord.join(',');
$('.forbiddenWordWebsite').tagsinput('add', config.forbiddenWord); $('#forbiddenWordWebsite').tagsinput('add', config.forbiddenWord);
} }
$('.bootstrap-tagsinput').addClass('form-control'); $('.bootstrap-tagsinput').addClass('form-control');
}); });
...@@ -62,6 +64,7 @@ Template.siteIndexationTpl.events({ ...@@ -62,6 +64,7 @@ Template.siteIndexationTpl.events({
const config = { const config = {
domain: event.target.hostUrlWebsite.value, domain: event.target.hostUrlWebsite.value,
forbiddenWord: event.target.forbiddenWordWebsite.value, forbiddenWord: event.target.forbiddenWordWebsite.value,
breadcrumb: event.target.breadcrumbWebsite.value,
}; };
Meteor.callPromise('defineConfig', config) Meteor.callPromise('defineConfig', config)
...@@ -81,6 +84,6 @@ Template.siteIndexationTpl.events({ ...@@ -81,6 +84,6 @@ Template.siteIndexationTpl.events({
}); });
}); });
event.target.reset(); event.target.reset();
$('.forbiddenWordWebsite').tagsinput('removeAll'); $('#forbiddenWordWebsite').tagsinput('removeAll');
}, },
}); });
...@@ -46,6 +46,9 @@ const checkData = { ...@@ -46,6 +46,9 @@ const checkData = {
'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div',
'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre'], 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre'],
allowedAttributes: [], allowedAttributes: [],
allowedClasses: {
div: ['breadcrumb'],
},
selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'], selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'],
allowedSchemes: ['http', 'https', 'ftp', 'mailto'], allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {}, allowedSchemesByTag: {},
...@@ -54,9 +57,14 @@ const checkData = { ...@@ -54,9 +57,14 @@ const checkData = {
}, },
// url pour crawler // url pour crawler
checkURL(url) { checkCrawlUrl(url) {
return !url.match(/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf)/); return !url.match(/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf)/);
}, },
// clean url (remove text after # and ?)
cleanUrl(url) {
return url.replace(/(#.*)/g, '').replace(/\/{2,}/g, '/');
},
}; };
export default checkData; export default checkData;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment