Commit 230545f8 authored by Nacim Goura's avatar Nacim Goura

add reglage for crawl

parent 81979ffb
......@@ -32,6 +32,8 @@
"import/extensions": ["off", "never"],
"import/no-extraneous-dependencies": "off",
"no-underscore-dangle": "off",
"class-methods-use-this": "off"
"class-methods-use-this": "off",
"no-param-reassign": "off",
"meteor/no-session": "off"
}
}
......@@ -45,3 +45,4 @@ practicalmeteor:chai
deanius:promise
dynamic-import
aldeed:tabular
ajduke:bootstrap-tagsinput
accounts-base@1.3.0
accounts-password@1.3.6
ajduke:bootstrap-tagsinput@0.7.1
alanning:roles@1.2.16
aldeed:autoform@6.2.0
aldeed:collection2-core@2.0.1
......@@ -99,6 +100,7 @@ templating-tools@1.1.2
tmeasday:check-npm-versions@0.3.1
tmeasday:test-reporter-helpers@0.2.1
tracker@1.1.3
twbs:bootstrap@3.3.6
ui@1.0.13
underscore@1.0.10
url@1.1.0
......
,ngoura,ngoura,15.06.2017 09:13,file:///home/ngoura/.config/libreoffice/4;
\ No newline at end of file
import SimpleSchema from 'simpl-schema';
import { check } from 'meteor/check';
import { Meteor } from 'meteor/meteor';
import { Accounts } from 'meteor/accounts-base';
Meteor.methods({
// add account
addAccount(user) {
check(user, Object);
// validate user data
new SimpleSchema({
email: {
type: String,
regEx: SimpleSchema.RegEx.Email,
},
username: String,
password: String,
}).validate(user);
const id = Accounts.createUser(user);
if (!id) {
throw new Meteor.Error('Error', 'Impossible de se connecter');
}
},
// delete account
deleteAccount(id) {
check(id, String);
Meteor.users.remove({ _id: id });
......
import { check } from 'meteor/check';
import { Meteor } from 'meteor/meteor';
import configCollection from '../../collections/configCollection';
Meteor.methods({
  /**
   * Create or update the configuration document of the logged-in user.
   *
   * Values that are missing or empty in `config` leave the stored value
   * untouched when a configuration already exists.
   *
   * @param {Object} config
   * @param {String} [config.domain] - a leading "https://" is stored as "http://"
   * @param {String|String[]} [config.forbiddenWord] - comma separated string or
   *   list of words; entries are trimmed and blank entries are dropped
   * @throws {Meteor.Error} not-authorized when no user is logged in
   */
  defineConfig(config) {
    check(config, Object);
    const userId = Meteor.userId();
    if (!userId) {
      throw new Meteor.Error('not-authorized', 'Vous devez être connecté pour modifier la configuration');
    }

    let domain;
    if (config.domain) {
      check(config.domain, String);
      // only rewrite the scheme: a plain replace('https', 'http') would also
      // alter a host or path that merely contains "https"
      domain = config.domain.replace(/^https:\/\//i, 'http://');
    }

    let forbiddenWord;
    if (config.forbiddenWord) {
      let rawWords = config.forbiddenWord;
      if (!Array.isArray(rawWords)) {
        check(rawWords, String);
        rawWords = rawWords.split(',');
      }
      check(rawWords, [String]);
      // blank entries are dropped so that a pattern built by joining the
      // words with '|' cannot contain an empty alternative matching every url
      const words = rawWords
        .map(word => word.trim())
        .filter(word => word.length > 0);
      if (words.length) {
        forbiddenWord = words;
      }
    }

    // findOne yields the document itself (or undefined), so the stored values
    // are readable for the fallbacks below
    const oldConfig = configCollection.findOne({ userId });
    if (oldConfig) {
      // config already exists: replace old values with the new ones
      configCollection.update(oldConfig._id, {
        $set: {
          domain: domain || oldConfig.domain,
          forbiddenWord: forbiddenWord || oldConfig.forbiddenWord,
        },
      });
    } else {
      // first configuration for this user: insert only the known fields
      const newConfig = { userId };
      if (domain) {
        newConfig.domain = domain;
      }
      if (forbiddenWord) {
        newConfig.forbiddenWord = forbiddenWord;
      }
      configCollection.insert(newConfig);
    }
  },
  /**
   * Fetch the configuration document of the logged-in user.
   * @returns {Object|undefined} the stored configuration, if any
   */
  getConfig() {
    return configCollection.findOne({ userId: Meteor.userId() });
  },
});
......@@ -10,11 +10,12 @@ export default class crawlWebsite {
constructor(url) {
this.urls = [];
this.crawler = new Crawler(url);
let i = 0;
this.crawler.on('fetchcomplete', (queueItem) => {
const url = queueItem.url;
console.log(i++);
if (checkData.checkURL(url)) {
console.log(url);
this.urls.push(url);
}
});
......
......@@ -29,6 +29,7 @@ exports.analyser = {
expand: true,
synonyms: [
'gosse, enfant',
'pmi, protection maternelle et infantile',
],
},
french_stemmer: {
......@@ -55,6 +56,9 @@ exports.analyser = {
'lowercase',
],
},
url_analyzer: {
tokenizer: 'uax_url_email',
},
},
},
},
......@@ -107,7 +111,17 @@ exports.mapping = {
},
url: {
type: 'text',
analyzer: 'standard',
analyzer: 'url_analyzer',
},
urlText: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
h1: {
type: 'text',
......
......@@ -64,6 +64,8 @@ export default class IndexGeneric {
'description.stemmed',
'body',
'body.stemmed',
'urlText',
'urlText.stemmed',
'url',
],
},
......
import url from 'url';
import { Meteor } from 'meteor/meteor';
import Sitemapper from 'sitemapper';
import Crawler from 'crawler';
......@@ -12,23 +13,28 @@ export default class IndexWebsite extends IndexGeneric {
/**
* realize indexation
* @param url
* @param urlWebsite
* @returns Promise
*/
async initIndexation(url) {
const crawl = new CrawlWebsite(url);
async initIndexation(urlWebsite) {
this.config = {
domain: url.parse(urlWebsite).hostname,
forbiddenWord: [],
};
this.config = await Meteor.call('getConfig', {});
/* let urls = [url];
console.log(this.config);
let urls = [urlWebsite];
// si c'est un sitemap on récupère ces url
if (checkData.isSitemap(url)) {
const { sites } = await sitemap.fetch(url);
if (checkData.isSitemap(urlWebsite)) {
const { sites } = await sitemap.fetch(urlWebsite);
urls = sites;
}
return this.crawlUrl(_.uniq(urls))
.then(dataToIndex => this.indexByBulk(dataToIndex))
.catch((error) => {
throw error;
});*/
});
}
/**
......@@ -44,18 +50,39 @@ export default class IndexWebsite extends IndexGeneric {
this.listDataError = [];
this.listDataForIndex = [];
this.listUrlAlreadyVisited = urls;
this.listPdf = [];
console.log(`${urls.length} à parser!`);
const crawl = new Crawler({
skipDuplicates: true,
callback: (error, res, done) => {
if (error || res.statusCode !== 200) {
this.listDataError.push({
url: res.options.uri,
error: error || res.statusCode,
});
} else {
} else if (res && res.$) {
res.$('a').each((index, a) => {
const urlHref = res.$(a).attr('href');
if (urlHref) {
const toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
if (toQueueUrl.includes(this.config.domain) && !this.listUrlAlreadyVisited.includes(toQueueUrl)) {
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
this.listUrlAlreadyVisited.push(toQueueUrl);
console.log(toQueueUrl);
crawl.queue(toQueueUrl);
}
}
}
});
this.parseData(res.$, res.options.uri);
} else {
if (!this.listUrlAlreadyVisited.includes(res.options.uri)) {
this.listUrlAlreadyVisited.push(res.options.uri);
this.listPdf.push(res.options.uri);
}
}
done();
......@@ -65,6 +92,7 @@ export default class IndexWebsite extends IndexGeneric {
crawl.queue(urls);
crawl.on('drain', () => {
console.error(this.listDataError);
console.log(this.listPdf);
resolve(this.listDataForIndex);
});
});
......@@ -75,7 +103,7 @@ export default class IndexWebsite extends IndexGeneric {
* @param $
* @param url
*/
parseData($, url) {
parseData($, currentUrl) {
const body = $('body');
body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text());
......@@ -88,7 +116,8 @@ export default class IndexWebsite extends IndexGeneric {
description: $('meta[name=description]').attr('content'),
body: checkData.slugText(checkData.cleanText(body.text())),
html: checkData.cleanText(body.html()),
url: decodeURI(url),
urlText: checkData.slugText(checkData.cleanText(decodeURI(currentUrl))),
url: decodeURI(currentUrl),
createdAt: new Date(),
};
......@@ -108,7 +137,7 @@ export default class IndexWebsite extends IndexGeneric {
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType,
_id: url,
_id: currentUrl,
},
});
......
import { check } from 'meteor/check';
import SimpleSchema from 'simpl-schema';
import { Meteor } from 'meteor/meteor';
import _ from 'lodash';
import testSearchCollection from '../../collections/testSearchCollection';
Meteor.methods({
addTest(test) {
check(test, Object);
// test data
new SimpleSchema({
urlExpected: {
type: String,
regEx: SimpleSchema.RegEx.Url,
},
term: String,
}).validate(test);
// replace https by http
test.urlExpected = _.replace(test.urlExpected, 'https', 'http');
// call search
Meteor.call('searchByTerm', test.term, Meteor.bindEnvironment((err, results) => {
if (err) {
console.log(err);
......@@ -18,8 +29,17 @@ Meteor.methods({
test.urlPosition = index + 1;
}
});
const oldTest = testSearchCollection.find({ term: test.term }).fetch();
if (oldTest && oldTest.length) {
testSearchCollection.update(oldTest[0]._id, {
$set: {
urlPosition: test.urlPosition,
},
});
} else {
testSearchCollection.insert(test);
}
}
}));
},
deleteTest(id) {
......
import SimpleSchema from 'simpl-schema';
import { Mongo } from 'meteor/mongo';
/**
 * Stores configuration documents (crawl, api, index), each tied to a userId.
 * @type {Mongo.Collection}
 */
const configCollection = new Mongo.Collection('configs');

// validation rules applied to every configuration document
const configSchema = new SimpleSchema({
  userId: String,
  domain: { type: String, regEx: SimpleSchema.RegEx.Url },
  forbiddenWord: Array,
  'forbiddenWord.$': String,
});

// the schema stays reachable through SimpleSchema.configCollection
SimpleSchema.configCollection = configSchema;
configCollection.attachSchema(configSchema);

export default configCollection;
......@@ -3,7 +3,7 @@ import SimpleSchema from 'simpl-schema';
import { Mongo } from 'meteor/mongo';
/**
* this local collection keep tracks of all tabs of this application
* this temporary collection stores the search tests
* @type {Mongo.Collection}
*/
const testSearchCollection = new Mongo.Collection('testSearch');
......
......@@ -2,6 +2,7 @@
import '../../api/indexation/methods';
import '../../api/account/methods';
import '../../api/config/methods';
import '../../api/testSearch/methods';
import '../../tabular/tabularUser';
import '../../tabular/tabularTestSearch';
......@@ -17,6 +17,9 @@
<div class="panel-body">
{{> tabular table=TabularTables.testSearch class="table table-striped table-bordered table-condensed text-center"}}
</div>
<div class="panel-footer">
<button class="btn btn-primary launch-test">Relancer les tests</button>
</div>
</div>
......
......@@ -20,6 +20,11 @@ Template.testSearchTpl.events({
console.log(error);
});
},
// Click on the "Relancer les tests" button (.launch-test): for now the handler
// only cancels the default click action; nothing is re-run from here yet.
'click .launch-test': (event) => {
event.preventDefault();
},
});
Template.testSearchActionTable.events({
......
......@@ -6,6 +6,7 @@ import displayNotif from '../../../../components/notifs/notifs';
import './add.html';
Template.addAccountTpl.events({
// add account
'submit form': (event) => {
event.preventDefault();
......
<template name="apiIndexationTpl">
<div class="panel panel-default wrapper">
<div class="panel-body">
<h3 class="text-center">Gestion de l'indexation des API</h3>
<h4 class="text-center">Gestion de l'indexation des API</h4>
</div>
</div>
......
......@@ -4,7 +4,7 @@
<div class="panel panel-default wrapper">
<div class="panel-body">
<h3 class="text-center">Gestion de l'indexation en général</h3>
<h4 class="text-center">Gestion de l'indexation en général</h4>
<button class="btn btn-info" id="initElastic">Initialisation ElasticSearch</button>
<button class="btn btn-primary" id="reindexElastic">Réindexation ElasticSearch</button>
</div>
......
......@@ -4,8 +4,8 @@
<div class="panel panel-default wrapper">
<div class="panel-body">
<h3 class="text-center">Gestion de l'indexation des sites</h3>
<form class="form-horizontal" id="formUrlSite">
<h4 class="text-center">Gestion de l'indexation des sites</h4>
<form id="formUrlSite" class="form-horizontal" method="post">
<label class="control-label">Url du site ou du sitemap à indexer : </label>
<div class="input-group">
<input type="url" class="form-control" placeholder="Url" required="required" name="urlSite">
......@@ -19,6 +19,23 @@
Indexation en cours...
</h4>
</form>
<hr>
<h4 class="text-center">Réglages</h4>
<div class="row">
<form id="formReglageSite" class="form-horizontal" method="post">
<div class="form-group">
<label>Nom de domaine du site</label>
<input type="text" class="form-control hostUrlWebsite" name="hostUrlWebsite" required="true">
</div>
<div class="form-group">
<label>Mot dans l'url à ne pas indexer (séparé par une virgule)</label>
<input type="text" class="form-control forbiddenWordWebsite" name="forbiddenWordWebsite" data-role="tagsinput" required="true">
</div>
<input type="submit" class="btn btn-success" value="Valider">
</form>
</div>
</div>
</div>
</template>
......@@ -7,6 +7,20 @@ import displayNotif from '../../../../components/notifs/notifs.js';
import './site.html';
/**
 * Once the template is rendered: turn the forbidden-words input into a tags
 * input, then pre-fill the settings form with the configuration stored for
 * the current user, if there is one.
 */
Template.siteIndexationTpl.onRendered(() => {
  const $forbiddenWords = $('.forbiddenWordWebsite');
  $forbiddenWords.tagsinput();
  Meteor.callPromise('getConfig', {})
    .then((config) => {
      // style the tags container whether or not a configuration exists
      $('.bootstrap-tagsinput').addClass('form-control');
      // getConfig yields nothing until a configuration has been saved
      if (!config) {
        return;
      }
      $('.hostUrlWebsite').val(config.domain);
      if (config.forbiddenWord && config.forbiddenWord.length) {
        // the tags input receives the words as one comma separated string
        $forbiddenWords.tagsinput('add', config.forbiddenWord.join(','));
      }
    })
    .catch((error) => {
      // surface the failure instead of leaving an unhandled rejection
      displayNotif({
        type: 'error',
        title: 'Configuration : ',
        message: error.reason ? error.reason : error,
        save: true,
      });
    });
});
Template.siteIndexationTpl.events({
'submit #formUrlSite': (event) => {
event.preventDefault();
......@@ -42,4 +56,31 @@ Template.siteIndexationTpl.events({
});
event.target.reset();
},
'submit #formReglageSite': (event) => {
event.preventDefault();
const config = {
domain: event.target.hostUrlWebsite.value,
forbiddenWord: event.target.forbiddenWordWebsite.value,
};
Meteor.callPromise('defineConfig', config)
.then(() => {
displayNotif({
type: 'success',
title: 'Configuration : ',
message: 'Configuration ajouté avec succès!',
save: true,
});
}).catch((error) => {
displayNotif({
type: 'error',
title: 'Configuration : ',
message: error.reason ? error.reason : error,
save: true,
});
});
event.target.reset();
$('.forbiddenWordWebsite').tagsinput('removeAll');
},
});
import { Meteor } from 'meteor/meteor';
import { Session } from 'meteor/session';
import { FlowRouter } from 'meteor/kadira:flow-router';
import { Template } from 'meteor/templating';
import displayNotif from '../../components/notifs/notifs';
......@@ -29,6 +30,12 @@ Template.loginTpl.events({
title: 'Succès : ',
message: `Bienvenue ${user.login}!`,
});
// get global config for user
Meteor.callPromise('getConfig', {})
.then((config) => {
console.log(config);
Session.set('config', config);
});
}
});
},
......
......@@ -896,20 +896,13 @@ domhandler@^2.3.0:
dependencies:
domelementtype "1"
domutils@1.5.1:
domutils@1.5.1, domutils@^1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.5.1.tgz#dcd8488a26f563d61079e48c9f7b7e32373682cf"
dependencies:
dom-serializer "0"
domelementtype "1"
domutils@^1.5.1:
version "1.6.2"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-1.6.2.tgz#1958cc0b4c9426e9ed367fb1c8e854891b0fa3ff"
dependencies:
dom-serializer "0"
domelementtype "1"
double-ended-queue@^2.1.0-0:
version "2.1.0-0"
resolved "https://registry.yarnpkg.com/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz#103d3527fd31528f40188130c841efdd78264e5c"
......@@ -924,9 +917,9 @@ ee-first@1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/ee-first/-/ee-first-1.1.1.tgz#590c61156b0ae2f4f0255732a158b266bc56b21d"
elasticsearch@^13.1.0:
version "13.1.0"
resolved "https://registry.yarnpkg.com/elasticsearch/-/elasticsearch-13.1.0.tgz#f22a2afe01f4c3cde660bdd5b77575c6f8b6a397"
elasticsearch@^13.1.1:
version "13.1.1"
resolved "https://registry.yarnpkg.com/elasticsearch/-/elasticsearch-13.1.1.tgz#83dbb5cf76ecc9bdd3ffceb5ce74d5fc7d23e798"
dependencies:
agentkeepalive "^2.2.0"
chalk "^1.0.0"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment