Commit fef1fde1 authored by Nacim Goura

finish demo

parent fc0518f2
......@@ -38,10 +38,12 @@ SimpleSchema.configCollection = new SimpleSchema({
'listConfig.$.breadcrumb': {
type: String,
label: 'Element du breadcrumb',
required: false,
},
'listConfig.$.forbiddenWordString': {
type: String,
label: 'Mot non indexable (à séparer par une virgule)',
required: false,
autoform: {
class: 'forbiddenWordWebsite',
},
......
import CrawlFacebook from './crawlFacebook';
import CrawlTwitter from './crawlTwitter';
import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import CrawlFacebook from '/imports/api/crawl/api/server/crawlFacebook';
import CrawlTwitter from '/imports/api/crawl/api/server/crawlTwitter';
export default class crawlApi {
export default class crawlApi extends CrawlGeneric {
constructor(config) {
switch (config.type) {
constructor(data) {
super();
switch (data.type) {
case 'facebook':
return new CrawlFacebook(config);
return new CrawlFacebook(this.config, data);
case 'twitter':
return new CrawlTwitter(config);
return new CrawlTwitter(this.config, data);
default:
throw new Meteor.Error('Error', 'Aucun type pour l\'API indiqué');
}
......
......@@ -5,13 +5,13 @@ import checkData from '/imports/utils/checkData';
export default class CrawlFacebook {
constructor(config) {
constructor(data, config) {
this.config = config;
console.log('crawl Facebook');
this.listDataForIndex = [];
if (config.content) {
this.content = JSON.parse(config.content);
if (data.content) {
this.content = JSON.parse(data.content);
return this.start();
}
throw new Meteor.Error('Error', 'aucune donnée Facebook!');
......@@ -71,7 +71,7 @@ export default class CrawlFacebook {
this.listDataForIndex.push({
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType,
_type: this.config.userId,
_id: dataForIndex.url,
},
});
......
......@@ -6,9 +6,10 @@ import checkData from '/imports/utils/checkData';
export default class CrawlTwitter {
constructor(config) {
constructor(config, data) {
console.log('crawl Twittter');
this.config = config;
this.data = data;
this.listDataForIndex = [];
this.client = new Twitter({
......@@ -27,7 +28,7 @@ export default class CrawlTwitter {
*/
async start() {
try {
const tweets = await this.client.get('statuses/user_timeline', { screen_name: this.config.idPage });
const tweets = await this.client.get('statuses/user_timeline', { screen_name: this.data.idPage });
return this.parseData(tweets);
} catch (e) {
throw new Meteor.Error('Error', 'Erreur lors l\'utilisation de l\'API Twitter', 'statuses/user_timeline');
......@@ -44,8 +45,8 @@ export default class CrawlTwitter {
const dataForIndex = {
tag: 'api',
apiName: 'twitter',
domain: this.config.idPage,
url: `https://twitter.com/${this.config.idPage}/status/${item.id_str}`,
domain: this.data.idPage,
url: `https://twitter.com/${this.data.idPage}/status/${item.id_str}`,
};
......@@ -65,7 +66,7 @@ export default class CrawlTwitter {
this.listDataForIndex.push({
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType,
_type: this.config.userId,
_id: dataForIndex.url,
},
});
......
import { getConfig } from '/imports/api/config/methods';
/**
 * Common base for the crawler classes (the API, network and website crawlers
 * extend it). Its only job is to load the configuration when an instance is
 * created and expose it as `this.config`.
 */
export default class crawlGeneric {
  constructor() {
    // getConfig comes from /imports/api/config/methods; the shape of the
    // returned object is not visible here (subclasses read `config.userId`).
    const loadedConfig = getConfig();
    this.config = loadedConfig;
  }
}
import { Meteor } from 'meteor/meteor';
import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import fs from 'fs';
import _ from 'lodash';
import Files from '/imports/api/crawl/network/networkCollection';
export default class CrawlNetwork {
export default class CrawlNetwork extends CrawlGeneric {
constructor() {
super();
this.files = Files.find({ userId: Meteor.userId() }).fetch();
this.listDataForIndex = [];
console.log('init crawl network!');
......@@ -23,7 +25,7 @@ export default class CrawlNetwork {
this.listDataForIndex.push({
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType,
_type: this.config.userId,
_id: file.name,
},
});
......
......@@ -8,7 +8,7 @@ export default new SimpleSchema({
type: String,
label: 'Url du site ou du sitemap à indexer :',
},
config: {
nameConfig: {
type: String,
label: 'Configuration à appliquer :',
},
......
import url from 'url';
import _ from 'lodash';
import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import { Meteor } from 'meteor/meteor';
import Crawler from 'crawler';
import Sitemapper from 'sitemapper';
import checkData from '/imports/utils/checkData';
export default class crawlWebsite {
export default class crawlWebsite extends CrawlGeneric {
/**
* crawl list urls
......@@ -13,13 +15,9 @@ export default class crawlWebsite {
* @returns {Promise}
*/
constructor(data) {
super();
this.urlWebsite = data.urlWebsite;
this.config = {
domain: url.parse(this.urlWebsite).hostname,
forbiddenWord: [],
};
this.config = JSON.parse(data.config);
this.config.crawl = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
return this.start();
}
......@@ -58,15 +56,17 @@ export default class crawlWebsite {
res.$('a').each((index, a) => {
const urlHref = res.$(a).attr('href');
if (urlHref) {
const toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
let toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
// check if same domain name
if (toQueueUrl.includes(this.config.domain)) {
toQueueUrl = toQueueUrl.replace('https', 'http');
// console.log(toQueueUrl, toQueueUrl.includes(this.config.crawl.domain));
if (toQueueUrl.includes(this.config.crawl.domain)) {
// clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word
if (!new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
if (!new RegExp(this.config.crawl.forbiddenWord.join('|')).test(toQueueUrl)) {
// check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls
......@@ -109,13 +109,12 @@ export default class crawlWebsite {
* @param listPdf
*/
parseData($, currentUrl, listPdf) {
const _this = this;
const body = $('body');
body.html(checkData.cleanHtml(body.html()));
const title = checkData.cleanText($('title').text());
const dataForIndex = {
tag: 'site',
domain: this.config.domain,
domain: this.config.crawl.domain,
title,
title_suggest: {
input: title,
......@@ -128,8 +127,8 @@ export default class crawlWebsite {
createdAt: new Date(),
};
if ($(_this.config.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(_this.config.breadcrumb).text()));
if ($(this.config.crawl.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(this.config.crawl.breadcrumb).text()));
}
if ($('h1').text().length) {
......@@ -147,7 +146,7 @@ export default class crawlWebsite {
this.listDataForIndex.push({
index: {
_index: Meteor.settings.private.elasticsearch.esIndex,
_type: Meteor.settings.private.elasticsearch.esType,
_type: this.config.userId,
_id: currentUrl,
},
});
......
......@@ -9,35 +9,48 @@ import IndexNetwork from '/imports/api/indexation/server/indexNetwork';
import formWebsiteCrawlSchema from '/imports/api/crawl/website/formWebsiteCrawlSchema';
import formApiCrawlSchema from '/imports/api/crawl/api/formApiCrawlSchema';
Meteor.methods({
initIndexElastic() {
export function initIndexElastic() {
this.unblock();
const index = new IndexGeneric();
return index.initElastic();
},
reIndexElastic() {
}
export function reIndexElastic() {
this.unblock();
const index = new IndexGeneric();
return index.reIndex();
},
indexWebsite(data) {
}
export function indexWebsite(data) {
check(data, Object);
formWebsiteCrawlSchema.validate(data);
const index = new IndexWebsite();
return index.start(data);
},
indexApi(data) {
}
export function indexApi(data) {
check(data, Object);
console.log(data);
formApiCrawlSchema.validate(data);
const index = new IndexApi();
return index.start(data);
},
indexNetwork() {
const indexNetwork = new IndexNetwork();
return indexNetwork.start();
},
removeFileNetwork(id) {
}
export function indexNetwork() {
const index = new IndexNetwork();
return index.start();
}
export function removeFileNetwork(id) {
check(id, String);
Files.remove({ _id: id });
},
}
// Register the functions exported above as Meteor methods under the same
// names, so they stay callable by name from the client (for example the
// website form submits to the "indexWebsite" method) while also being
// importable as plain functions.
Meteor.methods({
initIndexElastic,
reIndexElastic,
indexWebsite,
indexApi,
indexNetwork,
removeFileNetwork,
});
// Analysis settings for the Elasticsearch index: custom token filters and
// the three analyzers (french_heavy, french_light, url_analyzer) referenced
// by the field mapping.
exports.analyser = {
settings: {
analysis: {
filter: {
// Elision filter: removes these elided French articles to reduce noise
french_elision: {
type: 'elision',
articles_case: true,
articles: [
'l',
'm',
't',
'qu',
'n',
's',
'j',
'd',
'c',
'jusqu',
'quoiqu',
'lorsqu',
'puisqu',
],
},
// Synonyms (each entry is a comma-separated group of equivalent terms)
french_synonym: {
type: 'synonym',
ignore_case: true,
expand: true,
synonyms: [
'gosse, enfant',
'pmi, protection maternelle et infantile',
],
},
// Stemming: reduces words to their stem using the light French stemmer
french_stemmer: {
type: 'stemmer',
language: 'light_french',
},
},
analyzer: {
// Heavy French analysis: elision, folding, synonyms and stemming
french_heavy: {
tokenizer: 'icu_tokenizer',
filter: [
'french_elision',
'icu_folding',
'french_synonym',
'french_stemmer',
'lowercase',
'asciifolding',
],
},
// Light French analysis: HTML stripped, no synonyms and no stemming
french_light: {
tokenizer: 'icu_tokenizer',
char_filter: [
'html_strip',
],
filter: [
'french_elision',
'icu_folding',
'lowercase',
'asciifolding',
],
},
// Analyzer for URLs (keeps URLs and e-mail addresses as single tokens)
url_analyzer: {
tokenizer: 'uax_url_email',
filter: [
'french_elision',
'icu_folding',
'lowercase',
],
},
},
},
},
};
// Field mapping for indexed documents. Text fields are analyzed with
// french_light; title, description, body and urlText additionally expose a
// `stemmed` sub-field analyzed with french_heavy.
exports.mapping = {
properties: {
// Exact-match (keyword) fields
tag: {
type: 'keyword',
},
domain: {
type: 'keyword',
},
apiName: {
type: 'keyword',
},
title: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
// Completion field used for title suggestions
title_suggest: {
type: 'completion',
analyzer: 'french_light',
max_input_length: 100,
},
description: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
body: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
html: {
type: 'text',
analyzer: 'french_light',
},
// URL-like fields use the dedicated url_analyzer
url: {
type: 'text',
analyzer: 'url_analyzer',
},
urlText: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
h1: {
type: 'text',
analyzer: 'french_light',
},
h2: {
type: 'text',
analyzer: 'french_light',
},
breadcrumb: {
type: 'text',
analyzer: 'french_light',
},
listPdf: {
type: 'text',
analyzer: 'url_analyzer',
},
createdAt: {
type: 'date',
},
},
};
......@@ -2,12 +2,16 @@
import { Meteor } from 'meteor/meteor';
import elastic from '/imports/libs/elasticsearch/elasticsearch';
import configElastic from '/imports/libs/elasticsearch/elasticSearchConfig';
import { getConfig } from '/imports/api/config/methods';
const esIndex = Meteor.settings.private.elasticsearch.esIndex;
const esType = Meteor.settings.private.elasticsearch.esType;
export default class IndexGeneric {
/**
 * Loads the configuration through getConfig() and keeps it on the instance;
 * initElastic reads `this.config.userId` from it when creating the mapping.
 */
constructor() {
this.config = getConfig();
}
/**
 * Sends a bulk request through the elasticsearch wrapper.
 * @param data - forwarded unchanged to elastic.bulk
 * @param hasFile - forwarded unchanged to elastic.bulk
 *   (NOTE(review): presumably signals that the payload contains files — confirm in the wrapper)
 * @returns the value returned by elastic.bulk
 */
indexByBulk(data, hasFile) {
return elastic.bulk(data, hasFile);
}
......@@ -32,7 +36,7 @@ export default class IndexGeneric {
const mapping = configElastic.mapping;
await elastic.initAnalyzer(esIndex, analyser);
await elastic.initMapping(esIndex, esType, mapping);
await elastic.initMapping(esIndex, this.config.userId, mapping);
return elastic.createPipeline();
}
}
......@@ -7,7 +7,7 @@
<h4 class="text-center">Gestion de l'indexation des sites</h4>
{{#autoForm id="formWebsiteCrawl" schema=formWebsiteCrawlSchema buttonContent="Indexer" type="method" meteormethod="indexWebsite" }}
{{> afQuickField name='urlWebsite'}}
{{> afQuickField name='config' options=optionSelectConfig }}
{{> afQuickField name='nameConfig' options=optionSelectConfig }}
<button type="submit" class="btn btn-success">Indexer</button>
{{/autoForm}}
......
......@@ -24,7 +24,7 @@ Template.siteIndexationTpl.helpers({
config.listConfig.forEach((item) => {
options.push({
label: item.domain,
value: JSON.stringify(item),
value: item.domain,
});
});
}
......
......@@ -2,8 +2,7 @@
"private": {
"elasticsearch": {
"host": "localhost:9201",
"esIndex": "idsearch",
"esType": "haute-savoie"
"esIndex": "idsearch"
}
},
"public": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment