Commit fef1fde1 authored by Nacim Goura's avatar Nacim Goura

finish demo

parent fc0518f2
...@@ -38,10 +38,12 @@ SimpleSchema.configCollection = new SimpleSchema({ ...@@ -38,10 +38,12 @@ SimpleSchema.configCollection = new SimpleSchema({
'listConfig.$.breadcrumb': { 'listConfig.$.breadcrumb': {
type: String, type: String,
label: 'Element du breadcrumb', label: 'Element du breadcrumb',
required: false,
}, },
'listConfig.$.forbiddenWordString': { 'listConfig.$.forbiddenWordString': {
type: String, type: String,
label: 'Mot non indexable (à séparer par une virgule)', label: 'Mot non indexable (à séparer par une virgule)',
required: false,
autoform: { autoform: {
class: 'forbiddenWordWebsite', class: 'forbiddenWordWebsite',
}, },
......
import CrawlFacebook from './crawlFacebook'; import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import CrawlTwitter from './crawlTwitter'; import CrawlFacebook from '/imports/api/crawl/api/server/crawlFacebook';
import CrawlTwitter from '/imports/api/crawl/api/server/crawlTwitter';
export default class crawlApi { export default class crawlApi extends CrawlGeneric {
constructor(config) { constructor(data) {
switch (config.type) { super();
switch (data.type) {
case 'facebook': case 'facebook':
return new CrawlFacebook(config); return new CrawlFacebook(this.config, data);
case 'twitter': case 'twitter':
return new CrawlTwitter(config); return new CrawlTwitter(this.config, data);
default: default:
throw new Meteor.Error('Error', 'Aucun type pour l\'API indiqué'); throw new Meteor.Error('Error', 'Aucun type pour l\'API indiqué');
} }
......
...@@ -5,13 +5,13 @@ import checkData from '/imports/utils/checkData'; ...@@ -5,13 +5,13 @@ import checkData from '/imports/utils/checkData';
export default class CrawlFacebook { export default class CrawlFacebook {
constructor(config) { constructor(data, config) {
this.config = config; this.config = config;
console.log('crawl Facebook'); console.log('crawl Facebook');
this.listDataForIndex = []; this.listDataForIndex = [];
if (config.content) { if (data.content) {
this.content = JSON.parse(config.content); this.content = JSON.parse(data.content);
return this.start(); return this.start();
} }
throw new Meteor.Error('Error', 'aucune donnée Facebook!'); throw new Meteor.Error('Error', 'aucune donnée Facebook!');
...@@ -71,7 +71,7 @@ export default class CrawlFacebook { ...@@ -71,7 +71,7 @@ export default class CrawlFacebook {
this.listDataForIndex.push({ this.listDataForIndex.push({
index: { index: {
_index: Meteor.settings.private.elasticsearch.esIndex, _index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType, _type: this.config.userId,
_id: dataForIndex.url, _id: dataForIndex.url,
}, },
}); });
......
...@@ -6,9 +6,10 @@ import checkData from '/imports/utils/checkData'; ...@@ -6,9 +6,10 @@ import checkData from '/imports/utils/checkData';
export default class CrawlTwitter { export default class CrawlTwitter {
constructor(config) { constructor(config, data) {
console.log('crawl Twittter'); console.log('crawl Twittter');
this.config = config; this.config = config;
this.data = data;
this.listDataForIndex = []; this.listDataForIndex = [];
this.client = new Twitter({ this.client = new Twitter({
...@@ -27,7 +28,7 @@ export default class CrawlTwitter { ...@@ -27,7 +28,7 @@ export default class CrawlTwitter {
*/ */
async start() { async start() {
try { try {
const tweets = await this.client.get('statuses/user_timeline', { screen_name: this.config.idPage }); const tweets = await this.client.get('statuses/user_timeline', { screen_name: this.data.idPage });
return this.parseData(tweets); return this.parseData(tweets);
} catch (e) { } catch (e) {
throw new Meteor.Error('Error', 'Erreur lors l\'utilisation de l\'API Twitter', 'statuses/user_timeline'); throw new Meteor.Error('Error', 'Erreur lors l\'utilisation de l\'API Twitter', 'statuses/user_timeline');
...@@ -44,8 +45,8 @@ export default class CrawlTwitter { ...@@ -44,8 +45,8 @@ export default class CrawlTwitter {
const dataForIndex = { const dataForIndex = {
tag: 'api', tag: 'api',
apiName: 'twitter', apiName: 'twitter',
domain: this.config.idPage, domain: this.data.idPage,
url: `https://twitter.com/${this.config.idPage}/status/${item.id_str}`, url: `https://twitter.com/${this.data.idPage}/status/${item.id_str}`,
}; };
...@@ -65,7 +66,7 @@ export default class CrawlTwitter { ...@@ -65,7 +66,7 @@ export default class CrawlTwitter {
this.listDataForIndex.push({ this.listDataForIndex.push({
index: { index: {
_index: Meteor.settings.private.elasticsearch.esIndex, _index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType, _type: this.config.userId,
_id: dataForIndex.url, _id: dataForIndex.url,
}, },
}); });
......
import { getConfig } from '/imports/api/config/methods';
/**
 * Shared base class for the crawlers (extended by the API, network and
 * website crawlers). Its only job is to load the configuration once at
 * construction time and expose it as `this.config`.
 */
export default class crawlGeneric {
constructor() {
// Subclasses read fields such as `this.config.userId` and
// `this.config.listConfig` from this object.
// NOTE(review): presumably the config document of the current user —
// confirm against getConfig() in /imports/api/config/methods.
this.config = getConfig();
}
}
import { Meteor } from 'meteor/meteor'; import { Meteor } from 'meteor/meteor';
import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import fs from 'fs'; import fs from 'fs';
import _ from 'lodash'; import _ from 'lodash';
import Files from '/imports/api/crawl/network/networkCollection'; import Files from '/imports/api/crawl/network/networkCollection';
export default class CrawlNetwork { export default class CrawlNetwork extends CrawlGeneric {
constructor() { constructor() {
super();
this.files = Files.find({ userId: Meteor.userId() }).fetch(); this.files = Files.find({ userId: Meteor.userId() }).fetch();
this.listDataForIndex = []; this.listDataForIndex = [];
console.log('init crawl network!'); console.log('init crawl network!');
...@@ -23,7 +25,7 @@ export default class CrawlNetwork { ...@@ -23,7 +25,7 @@ export default class CrawlNetwork {
this.listDataForIndex.push({ this.listDataForIndex.push({
index: { index: {
_index: Meteor.settings.private.elasticsearch.esIndex, _index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType, _type: this.config.userId,
_id: file.name, _id: file.name,
}, },
}); });
......
...@@ -8,7 +8,7 @@ export default new SimpleSchema({ ...@@ -8,7 +8,7 @@ export default new SimpleSchema({
type: String, type: String,
label: 'Url du site ou du sitemap à indexer :', label: 'Url du site ou du sitemap à indexer :',
}, },
config: { nameConfig: {
type: String, type: String,
label: 'Configuration à appliquer :', label: 'Configuration à appliquer :',
}, },
......
import url from 'url'; import url from 'url';
import _ from 'lodash';
import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import { Meteor } from 'meteor/meteor'; import { Meteor } from 'meteor/meteor';
import Crawler from 'crawler'; import Crawler from 'crawler';
import Sitemapper from 'sitemapper'; import Sitemapper from 'sitemapper';
import checkData from '/imports/utils/checkData'; import checkData from '/imports/utils/checkData';
export default class crawlWebsite { export default class crawlWebsite extends CrawlGeneric {
/** /**
* crawl list urls * crawl list urls
...@@ -13,13 +15,9 @@ export default class crawlWebsite { ...@@ -13,13 +15,9 @@ export default class crawlWebsite {
* @returns {Promise} * @returns {Promise}
*/ */
constructor(data) { constructor(data) {
super();
this.urlWebsite = data.urlWebsite; this.urlWebsite = data.urlWebsite;
this.config = { this.config.crawl = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
domain: url.parse(this.urlWebsite).hostname,
forbiddenWord: [],
};
this.config = JSON.parse(data.config);
return this.start(); return this.start();
} }
...@@ -58,15 +56,17 @@ export default class crawlWebsite { ...@@ -58,15 +56,17 @@ export default class crawlWebsite {
res.$('a').each((index, a) => { res.$('a').each((index, a) => {
const urlHref = res.$(a).attr('href'); const urlHref = res.$(a).attr('href');
if (urlHref) { if (urlHref) {
const toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href')); let toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
// check if same domain name // check if same domain name
if (toQueueUrl.includes(this.config.domain)) { toQueueUrl = toQueueUrl.replace('https', 'http');
// console.log(toQueueUrl, toQueueUrl.includes(this.config.crawl.domain));
if (toQueueUrl.includes(this.config.crawl.domain)) {
// clean url // clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl); // toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited // check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) { if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word // check if url has forbidden word
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) { if (!new RegExp(this.config.crawl.forbiddenWord.join('|')).test(toQueueUrl)) {
// check if url is good for crawl // check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) { if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls // add url in already visited urls
...@@ -109,13 +109,12 @@ export default class crawlWebsite { ...@@ -109,13 +109,12 @@ export default class crawlWebsite {
* @param listPdf * @param listPdf
*/ */
parseData($, currentUrl, listPdf) { parseData($, currentUrl, listPdf) {
const _this = this;
const body = $('body'); const body = $('body');
body.html(checkData.cleanHtml(body.html())); body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text()); const title = checkData.cleanText($('title').text());
const dataForIndex = { const dataForIndex = {
tag: 'site', tag: 'site',
domain: this.config.domain, domain: this.config.crawl.domain,
title, title,
title_suggest: { title_suggest: {
input: title, input: title,
...@@ -128,8 +127,8 @@ export default class crawlWebsite { ...@@ -128,8 +127,8 @@ export default class crawlWebsite {
createdAt: new Date(), createdAt: new Date(),
}; };
if ($(_this.config.breadcrumb).text().length) { if ($(this.config.crawl.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(_this.config.breadcrumb).text())); dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(this.config.crawl.breadcrumb).text()));
} }
if ($('h1').text().length) { if ($('h1').text().length) {
...@@ -147,7 +146,7 @@ export default class crawlWebsite { ...@@ -147,7 +146,7 @@ export default class crawlWebsite {
this.listDataForIndex.push({ this.listDataForIndex.push({
index: { index: {
_index: Meteor.settings.private.elasticsearch.esIndex, _index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType, _type: this.config.userId,
_id: currentUrl, _id: currentUrl,
}, },
}); });
......
...@@ -9,35 +9,48 @@ import IndexNetwork from '/imports/api/indexation/server/indexNetwork'; ...@@ -9,35 +9,48 @@ import IndexNetwork from '/imports/api/indexation/server/indexNetwork';
import formWebsiteCrawlSchema from '/imports/api/crawl/website/formWebsiteCrawlSchema'; import formWebsiteCrawlSchema from '/imports/api/crawl/website/formWebsiteCrawlSchema';
import formApiCrawlSchema from '/imports/api/crawl/api/formApiCrawlSchema'; import formApiCrawlSchema from '/imports/api/crawl/api/formApiCrawlSchema';
/**
 * Meteor method body: initialise the Elasticsearch index.
 *
 * Must be invoked with a method invocation as `this`, because it calls
 * `this.unblock()` before doing any work.
 *
 * @returns {*} whatever `IndexGeneric#initElastic` returns.
 */
export function initIndexElastic() {
  this.unblock();
  return new IndexGeneric().initElastic();
}
/**
 * Meteor method body: trigger a re-index through `IndexGeneric#reIndex`.
 *
 * Must be invoked with a method invocation as `this`, because it calls
 * `this.unblock()` before doing any work.
 *
 * @returns {*} whatever `IndexGeneric#reIndex` returns.
 */
export function reIndexElastic() {
  this.unblock();
  return new IndexGeneric().reIndex();
}
/**
 * Meteor method body: validate a website-crawl form submission and hand it
 * to the website indexer.
 *
 * @param {Object} data - form payload; checked to be an object, then
 *   validated against `formWebsiteCrawlSchema`.
 * @returns {*} whatever `IndexWebsite#start` returns for `data`.
 */
export function indexWebsite(data) {
  // Validation happens before the indexer is even constructed.
  check(data, Object);
  formWebsiteCrawlSchema.validate(data);

  return new IndexWebsite().start(data);
}
/**
 * Meteor method body: validate an API-crawl form submission and hand it to
 * the API indexer.
 *
 * The raw payload is deliberately not logged: it comes straight from the
 * client and is only trusted after `check` and the schema validation pass.
 *
 * @param {Object} data - form payload; checked to be an object, then
 *   validated against `formApiCrawlSchema`.
 * @returns {*} whatever `IndexApi#start` returns for `data`.
 */
export function indexApi(data) {
  check(data, Object);
  formApiCrawlSchema.validate(data);
  const index = new IndexApi();
  return index.start(data);
}
/**
 * Meteor method body: start the network (uploaded files) indexer.
 *
 * @returns {*} whatever `IndexNetwork#start` returns.
 */
export function indexNetwork() {
  return new IndexNetwork().start();
}
/**
 * Meteor method body: remove one uploaded network file.
 *
 * The removal is scoped to the calling user. The id comes from the client,
 * so without the `userId` filter any caller could delete any user's file by
 * guessing or observing its `_id`. The same `userId: Meteor.userId()`
 * filter is what the network crawler uses to list a user's files.
 *
 * @param {string} id - `_id` of the file document to remove.
 */
export function removeFileNetwork(id) {
  check(id, String);
  Files.remove({ _id: id, userId: Meteor.userId() });
}
Meteor.methods({ Meteor.methods({
initIndexElastic() { initIndexElastic,
this.unblock(); reIndexElastic,
const index = new IndexGeneric(); indexWebsite,
return index.initElastic(); indexApi,
}, indexNetwork,
reIndexElastic() { removeFileNetwork,
this.unblock();
const index = new IndexGeneric();
return index.reIndex();
},
indexWebsite(data) {
check(data, Object);
formWebsiteCrawlSchema.validate(data);
const index = new IndexWebsite();
return index.start(data);
},
indexApi(data) {
check(data, Object);
formApiCrawlSchema.validate(data);
const index = new IndexApi();
return index.start(data);
},
indexNetwork() {
const indexNetwork = new IndexNetwork();
return indexNetwork.start();
},
removeFileNetwork(id) {
check(id, String);
Files.remove({ _id: id });
},
}); });
// Analysis settings: custom token filters and the three analyzers
// (french_heavy, french_light, url_analyzer) referenced by the field mapping.
exports.analyser = {
settings: {
analysis: {
filter: {
// Strip these elided articles (l', d', qu', ...) to reduce noise.
french_elision: {
type: 'elision',
articles_case: true,
articles: [
'l',
'm',
't',
'qu',
'n',
's',
'j',
'd',
'c',
'jusqu',
'quoiqu',
'lorsqu',
'puisqu',
],
},
// Synonyms: each entry is a comma-separated group of equivalent terms.
french_synonym: {
type: 'synonym',
ignore_case: true,
expand: true,
synonyms: [
'gosse, enfant',
'pmi, protection maternelle et infantile',
],
},
// Word stems (stemming).
french_stemmer: {
type: 'stemmer',
language: 'light_french',
},
},
analyzer: {
// Heavy French analysis: elision, folding, synonyms and stemming.
french_heavy: {
tokenizer: 'icu_tokenizer',
filter: [
'french_elision',
'icu_folding',
'french_synonym',
'french_stemmer',
'lowercase',
'asciifolding',
],
},
// Light French analysis: HTML stripped, no synonyms, no stemming.
french_light: {
tokenizer: 'icu_tokenizer',
char_filter: [
'html_strip',
],
filter: [
'french_elision',
'icu_folding',
'lowercase',
'asciifolding',
],
},
// Analyzer for URLs (tokenized with uax_url_email).
url_analyzer: {
tokenizer: 'uax_url_email',
filter: [
'french_elision',
'icu_folding',
'lowercase',
],
},
},
},
},
};
// Field mapping for indexed documents.
// - tag, domain, apiName: keyword fields.
// - title, description, body, urlText: french_light text with a `stemmed`
//   sub-field analysed by french_heavy.
// - html, h1, h2, breadcrumb: french_light text only.
// - url, listPdf: text analysed by url_analyzer.
// - title_suggest: completion field; createdAt: date.
exports.mapping = {
properties: {
tag: {
type: 'keyword',
},
domain: {
type: 'keyword',
},
apiName: {
type: 'keyword',
},
title: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
// Completion field; the website crawler fills it from the page title.
title_suggest: {
type: 'completion',
analyzer: 'french_light',
max_input_length: 100,
},
description: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
body: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
html: {
type: 'text',
analyzer: 'french_light',
},
url: {
type: 'text',
analyzer: 'url_analyzer',
},
urlText: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
h1: {
type: 'text',
analyzer: 'french_light',
},
h2: {
type: 'text',
analyzer: 'french_light',
},
breadcrumb: {
type: 'text',
analyzer: 'french_light',
},
listPdf: {
type: 'text',
analyzer: 'url_analyzer',
},
createdAt: {
type: 'date',
},
},
};
...@@ -2,12 +2,16 @@ ...@@ -2,12 +2,16 @@
import { Meteor } from 'meteor/meteor'; import { Meteor } from 'meteor/meteor';
import elastic from '/imports/libs/elasticsearch/elasticsearch'; import elastic from '/imports/libs/elasticsearch/elasticsearch';
import configElastic from '/imports/libs/elasticsearch/elasticSearchConfig'; import configElastic from '/imports/libs/elasticsearch/elasticSearchConfig';
import { getConfig } from '/imports/api/config/methods';
const esIndex = Meteor.settings.private.elasticsearch.esIndex; const esIndex = Meteor.settings.private.elasticsearch.esIndex;
const esType = Meteor.settings.private.elasticsearch.esType;
export default class IndexGeneric { export default class IndexGeneric {
constructor() {
this.config = getConfig();
}
indexByBulk(data, hasFile) { indexByBulk(data, hasFile) {
return elastic.bulk(data, hasFile); return elastic.bulk(data, hasFile);
} }
...@@ -32,7 +36,7 @@ export default class IndexGeneric { ...@@ -32,7 +36,7 @@ export default class IndexGeneric {
const mapping = configElastic.mapping; const mapping = configElastic.mapping;
await elastic.initAnalyzer(esIndex, analyser); await elastic.initAnalyzer(esIndex, analyser);
await elastic.initMapping(esIndex, esType, mapping); await elastic.initMapping(esIndex, this.config.userId, mapping);
return elastic.createPipeline(); return elastic.createPipeline();
} }
} }
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
<h4 class="text-center">Gestion de l'indexation des sites</h4> <h4 class="text-center">Gestion de l'indexation des sites</h4>
{{#autoForm id="formWebsiteCrawl" schema=formWebsiteCrawlSchema buttonContent="Indexer" type="method" meteormethod="indexWebsite" }} {{#autoForm id="formWebsiteCrawl" schema=formWebsiteCrawlSchema buttonContent="Indexer" type="method" meteormethod="indexWebsite" }}
{{> afQuickField name='urlWebsite'}} {{> afQuickField name='urlWebsite'}}
{{> afQuickField name='config' options=optionSelectConfig }} {{> afQuickField name='nameConfig' options=optionSelectConfig }}
<button type="submit" class="btn btn-success">Indexer</button> <button type="submit" class="btn btn-success">Indexer</button>
{{/autoForm}} {{/autoForm}}
......
...@@ -24,7 +24,7 @@ Template.siteIndexationTpl.helpers({ ...@@ -24,7 +24,7 @@ Template.siteIndexationTpl.helpers({
config.listConfig.forEach((item) => { config.listConfig.forEach((item) => {
options.push({ options.push({
label: item.domain, label: item.domain,
value: JSON.stringify(item), value: item.domain,
}); });
}); });
} }
......
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
"private": { "private": {
"elasticsearch": { "elasticsearch": {
"host": "localhost:9201", "host": "localhost:9201",
"esIndex": "idsearch", "esIndex": "idsearch"
"esType": "haute-savoie"
} }
}, },
"public": { "public": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment