Commit 9e80fe06 authored by Nacim Goura

Change the website crawling system

parent 012d7696
import { getConfig } from '/imports/api/config/methods';
import elastic from '/imports/libs/elasticsearch/elasticsearch';
/**
 * Base class for crawlers: loads the application configuration once and
 * forwards indexing requests to the Elasticsearch helper module.
 */
export default class crawlGeneric {
  /**
   * Stores the value returned by `getConfig()` on `this.config`.
   */
  constructor() {
    const loadedConfig = getConfig();
    this.config = loadedConfig;
  }

  /**
   * Index a single document.
   * @param esIndex - target index
   * @param esType - document type
   * @param id - document id
   * @param obj - document body
   * @returns whatever `elastic.index` returns
   */
  indexOne(esIndex, esType, id, obj) {
    const response = elastic.index(esIndex, esType, id, obj);
    return response;
  }

  /**
   * Index several documents in one request.
   * @param data - bulk payload, passed through untouched
   * @param {boolean} [hasFile=false] - forwarded as-is to `elastic.bulk`
   * @returns whatever `elastic.bulk` returns
   */
  indexByBulk(data, hasFile = false) {
    const response = elastic.bulk(data, hasFile);
    return response;
  }
}
import url from 'url';
import _ from 'lodash';
import CrawlGeneric from '/imports/api/crawl/crawlGeneric';
import { Meteor } from 'meteor/meteor';
import Crawler from 'crawler';
import Sitemapper from 'sitemapper';
import checkData from '/imports/utils/checkData';
import jobCollection from '/imports/api/job/jobCollection';
/**
 * Crawls one website (or the pages listed in its sitemap), follows the links
 * that stay on the configured domain and indexes every fetched page through
 * `contextIndex.indexOne`.
 */
export default class crawlWebsite extends CrawlGeneric {
  /**
   * Prepare the crawl and start it immediately.
   *
   * NOTE: the constructor returns the promise produced by `start()`, so
   * `await new crawlWebsite(...)` yields the crawl result and not the
   * instance. Existing callers rely on this, so it is kept as-is.
   *
   * @param data - job description; `name`, `urlWebsite` and `nameConfig` are read
   * @param contextIndex - object exposing `indexOne(index, type, id, doc)`
   * @returns {Promise} resolves with the list of documents sent for indexing
   */
  constructor(data, contextIndex) {
    super();
    this.name = data.name;
    this.contextIndex = contextIndex;
    this.urlWebsite = data.urlWebsite;
    // Read the user id before `this.config` is narrowed to the site entry.
    this.userId = this.config.userId;
    this.config = _.find(this.config.listConfig, n => n.domain === data.nameConfig);
    return this.start();
  }

  /**
   * Run the crawl.
   *
   * @returns {Promise} resolves with `this.listDataForIndex` once the crawler
   *   queue is drained and every pending indexing call has settled
   * @throws {Meteor.Error} (as a rejection) when no site configuration matches
   *   or when there is no URL to crawl
   */
  async start() {
    // Fail fast: without a site configuration every page would only produce
    // a TypeError deep inside the crawler callback.
    if (!this.config) {
      throw new Meteor.Error('indexation', 'Aucune configuration trouvée pour ce site!');
    }

    let urls = [this.urlWebsite];
    // When the entry point is a sitemap, crawl the URLs it lists instead.
    if (checkData.isSitemap(this.urlWebsite)) {
      const sitemap = new Sitemapper();
      const { sites } = await sitemap.fetch(this.urlWebsite);
      urls = sites;
    }
    // Stop here: previously the crawl went on after rejecting.
    if (!urls || urls.length === 0) {
      throw new Meteor.Error('indexation', 'Aucune url fourni!');
    }

    const { domain } = this.config;
    const forbiddenWord = this.config.forbiddenWord || [];
    // Built once for the whole crawl instead of once per link.
    const forbiddenPattern = forbiddenWord.length > 0
      ? new RegExp(forbiddenWord.join('|'))
      : null;
    // Constant-time lookup; `listUrlAlreadyVisited` is kept in sync below.
    const visited = new Set(urls);
    // Indexing promises still in flight when the queue drains.
    const pendingIndexing = [];

    this.listDataError = [];
    this.listDataForIndex = [];
    this.listUrlAlreadyVisited = urls;
    console.log(`${urls.length} à parser!`);

    // Walk the anchors of a page: queue the URLs accepted by
    // `checkData.checkCrawlUrl`, return the other same-site URLs.
    const collectLinks = (crawl, res) => {
      const internalLink = [];
      res.$('a').each((index, a) => {
        const urlHref = res.$(a).attr('href');
        if (!urlHref) {
          return;
        }
        const toQueueUrl = checkData.cleanUrl(url.resolve(res.options.uri, urlHref));
        // Same domain only, and never the search pages.
        const isSameSite = toQueueUrl.includes(domain) && !toQueueUrl.includes('recherche');
        if (!isSameSite || visited.has(toQueueUrl)) {
          return;
        }
        if (forbiddenPattern && forbiddenPattern.test(toQueueUrl)) {
          return;
        }
        visited.add(toQueueUrl);
        this.listUrlAlreadyVisited.push(toQueueUrl);
        if (checkData.checkCrawlUrl(toQueueUrl)) {
          crawl.queue(toQueueUrl);
        } else {
          internalLink.push(toQueueUrl);
        }
      });
      return internalLink;
    };

    // Handle one crawler response: record failures, follow links, index the page.
    const handleResponse = (crawl, error, res) => {
      console.log(crawl.queueSize, res.options.uri);
      if (error || res.statusCode !== 200) {
        this.listDataError.push({
          url: res.options.uri,
          error: error || res.statusCode,
        });
        return;
      }
      if (!res.$) {
        return;
      }
      const internalLink = collectLinks(crawl, res);
      // Pages reached through a fragment or a query string are not indexed.
      if (/[#?]/.test(res.options.uri)) {
        return;
      }
      const urlParse = res.options.uri.replace('https', 'http').replace(/\/$/, '');
      const indexing = this.parseData(res.$, urlParse, internalLink, res.headers.date)
        .then(() => {
          jobCollection.update({ name: this.name }, { $inc: { numberIndexed: 1 } });
        })
        .catch((err) => {
          console.log(err);
        });
      pendingIndexing.push(indexing);
    };

    return new Promise((resolve) => {
      const crawl = new Crawler({
        retries: 10,
        skipDuplicates: true,
        rotateUA: true,
        userAgent: [
          'Mozilla/5.0 (compatible; fr-crawler/1.1)',
          'Googlebot/2.1 (+http://www.google.com/bot.html)',
          'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36',
          'Mozilla/5.0 (Linux; Android 4.2.2; Nexus 7 Build/JDQ39) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.49 Safari/537.31',
          'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)',
          'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1',
          'Mozilla/5.0 (X11; Linux x86_64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
          'Mozilla/5.0 (X11; U; Linux i686; fr; rv:1.9b5) Gecko/2008041514 Firefox/3.0b5',
          'Opera/9.80 (X11; Linux x86_64; U; Ubuntu; fr) Presto/2.10.289 Version/12.01',
          'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        ],
        callback: (error, res, done) => {
          try {
            handleResponse(crawl, error, res);
          } catch (err) {
            // A page that cannot be processed is reported like any other failure.
            this.listDataError.push({
              url: res && res.options ? res.options.uri : undefined,
              error: err,
            });
          } finally {
            // Always release the crawler slot, otherwise the queue stalls.
            done();
          }
        },
      });

      crawl.on('drain', () => {
        console.error(`Nombre de 404 : ${_.filter(this.listDataError, n => n.error === 404).length}`);
        console.error(`Autre Erreur : ${_.filter(this.listDataError, n => n.error !== 404).length}`);
        // Every entry of `pendingIndexing` ends with a catch, so this never rejects.
        Promise.all(pendingIndexing).then(() => resolve(this.listDataForIndex));
      });
      crawl.queue(urls);
    });
  }

  /**
   * Build the document describing one page, remember it in
   * `this.listDataForIndex` and send it to `contextIndex.indexOne`.
   *
   * @param $ - DOM query function of the fetched page (`res.$`)
   * @param currentUrl - normalised page URL, also used as the document id
   * @param internalLink - same-site URLs found on the page that were not queued
   * @param headerDate - value of the response `date` header
   * @returns {Promise} result of `contextIndex.indexOne`
   */
  async parseData($, currentUrl, internalLink, headerDate) {
    const body = $('body');
    if (this.config.excludeElement) {
      $(this.config.excludeElement).remove();
    }
    body.html(checkData.cleanHtml(body.html()));

    const title = checkData.cleanText($('title').text());
    // Missing meta tags are skipped so the text "undefined" is never indexed.
    const description = [
      'meta[name=description]',
      'meta[property="og:title"]',
      'meta[name=title]',
      'meta[property="video:actor"]',
      'meta[property="video:director"]',
      'meta[property="og:description"]',
    ]
      .map(selector => $(selector).attr('content'))
      .filter(Boolean)
      .join('\n');
    const decodedUrl = decodeURI(currentUrl);

    const dataForIndex = {
      tag: 'website',
      jobName: this.name,
      domain: this.config.domain,
      title,
      title_suggest: {
        // Replace - and _ and collapse repeated whitespace for autocompletion.
        input: title.replace(/[-_]/g, ' ').replace(/[^\S]{2,}/g, ' ').trim(),
      },
      description: checkData.cleanText(description),
      body: checkData.cleanText(body.text()),
      image: $('meta[property="og:image"]').attr('content'),
      html: body.html(),
      urlText: checkData.cleanText(decodedUrl).replace(/http|www|html/g, '').replace(/\.|-/g, ' '),
      url: decodedUrl,
      createdAt: new Date(headerDate),
    };

    // Optional fields are only set when the page provides some text for them.
    const breadcrumb = $(this.config.breadcrumb).text();
    if (breadcrumb.length) {
      dataForIndex.breadcrumb = checkData.cleanText(breadcrumb);
    }
    const h1 = $('h1').text();
    if (h1.length) {
      dataForIndex.h1 = checkData.cleanText(h1);
    }
    const h2 = $('h2').text();
    if (h2.length) {
      dataForIndex.h2 = checkData.cleanText(h2);
    }
    if (internalLink && internalLink.length) {
      dataForIndex.internalLink = internalLink.join(' ');
    }

    this.listDataForIndex.push(dataForIndex);
    return this.contextIndex.indexOne(this.userId, 'website', currentUrl, dataForIndex);
  }
}
......@@ -11,15 +11,14 @@ export default class IndexWebsite extends IndexGeneric {
*/
async start(data) {
try {
const context = this;
const dataToIndex = await new CrawlWebsite(data, context);
const result = await new CrawlWebsite(data);
/**
* dataToIndex index website page
* false no file
*/
// const resultIndex = await this.indexByBulk(dataToIndex, false);
return {
message: `Indexation de ${dataToIndex.length} liens pour ${data.urlWebsite} fini avec succès!`,
message: `Indexation de ${result.numberIndexed} liens pour ${data.urlWebsite} fini avec succès!`,
};
} catch (error) {
throw error;
......
......@@ -61,6 +61,7 @@ export default class Search {
"urlText",
"h1",
"breadcrumb",
"internalLink",
],
"boost": 2,
},
......@@ -73,7 +74,7 @@ export default class Search {
"body",
"h2",
"html",
"listPdf",
"internalLink.stemmed",
],
"boost": 1,
},
......
......@@ -219,10 +219,20 @@ exports.mapping = {
type: 'text',
analyzer: 'french_light',
},
listPdf: {
internalLink: {
type: 'text',
analyzer: 'url_analyzer',
},
internalLinkText: {
type: 'text',
analyzer: 'french_light',
fields: {
stemmed: {
type: 'text',
analyzer: 'french_heavy',
},
},
},
createdAt: {
type: 'date',
},
......
......@@ -32,7 +32,7 @@ export default {
* @param hasFile
* @returns promise
*/
bulk(body, hasFile) {
bulk(body, hasFile = false) {
const params = {
body,
};
......
......@@ -33,7 +33,6 @@ Template.accountActionTable.events({
Template.jobActionTable.events({
'click .start-job': function () {
console.log(this);
Meteor.callPromise('startJob', this._id)
.then((result) => {
displayNotif({
......
......@@ -7,9 +7,9 @@
<div class="panel-body">
<div class="row">
<select name="selectDateChart" id="selectDateChart" class="form-control">
<option value="day">Jour</option>
<option value="week">Semaine</option>
<option value="month">Mois</option>
<option value="day">Aujourd'hui</option>
<option value="week">Cette semaine</option>
<option value="month">Ce mois</option>
</select>
<div class="col-md-6">
<canvas id="termFrequencyChart"></canvas>
......
......@@ -49,7 +49,7 @@ function defineFrequencyTerm(stats) {
yAxes: [{
ticks: {
beginAtZero: true,
callback(value) { if (value % 1 === 0) { return value; } },
// callback(value) { if (value % 1 === 0) { return value; } },
},
}],
},
......@@ -101,14 +101,23 @@ function defineResponseTime(stats, filter) {
});
}
/**
 * Draw both statistics charts for the current period.
 *
 * The period is bounded by `moment().startOf(filter)` and
 * `moment().endOf(filter)`; only stats whose `createdAt` falls inside it are
 * passed to the chart builders.
 *
 * @param {string} [filter='day'] - unit handed to moment's `startOf`/`endOf`
 */
function renderGraph(filter = 'day') {
  const periodStart = moment().startOf(filter).toDate();
  const periodEnd = moment().endOf(filter).toDate();
  const selector = {
    createdAt: {
      $gte: periodStart,
      $lte: periodEnd,
    },
  };
  const stats = statCollection.find(selector).fetch();
  if (!stats) {
    return;
  }
  defineResponseTime(stats, filter);
  defineFrequencyTerm(stats);
}
// Draw the charts once the template is rendered. All the work is delegated to
// renderGraph() (default filter: 'day'); the previous inline, unfiltered
// rendering is dropped so the charts are not built twice per run.
Template.statTpl.hooks({
  rendered() {
    this.autorun(() => {
      renderGraph();
    });
  },
});
......@@ -117,16 +126,7 @@ Template.statTpl.events({
'change #selectDateChart': (event) => {
const filter = event.currentTarget.value;
if (filter) {
const stats = statCollection.find({
createdAt: {
$gte: moment().startOf(filter).toDate(),
$lte: moment().endOf(filter).toDate(),
},
}).fetch();
if (stats && stats.length) {
defineResponseTime(stats, filter);
defineFrequencyTerm(stats);
}
renderGraph(filter);
}
},
});
......
import SimpleSchema from 'simpl-schema';
import sanitizeHtml from 'sanitize-html';
import _ from 'lodash';
import detergent from 'detergent';
import unfancy from 'string-unfancy';
const checkData = {
isUrl(str) {
const regexp = new RegExp(SimpleSchema.RegEx.Url);
return regexp.test(str);
const detergent = require('detergent');
module.exports = {
/**
* @param {string} url
* @returns {boolean}
*/
hasGoodExtension(url) {
return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx|mp3)/i.test(url);
},
/**
* @param {string} url
* @param {string} forbiddenWord
* @returns {boolean}
*/
hasNotForbiddenWord(url, forbiddenWord = '') {
forbiddenWord = forbiddenWord.replace(/[,]/g, '|');
return !new RegExp(forbiddenWord).test(url);
},
isSitemap(str) {
return !!(this.isUrl(str) && _.includes(['xml', 'txt'], _.last(_.split(str, '.'))) && _.includes(str, 'sitemap'));
/**
* @param {string} url
* @returns {boolean}
*/
isSitemap(url) {
return /sitemap/.test(url);
},
cleanText(str) {
// enleve saut de ligne et slash
// let cleanStr = _.replace(str, /[\n\\/]/gm, ' ');
// met un espace avant une majuscule
// cleanStr = _.replace(cleanStr, /([A-Z])/gm, ' $1');
// enleve les multiples espace
// cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' ');
let resultText = detergent(str, {
removeWidows: false, // replace the last space in paragraph with &nbsp;
convertEntities: false, // encode all non-ASCII chars
convertDashes: false, // typographically-correct the n/m-dashes
convertApostrophes: false, // typographically-correct the apostrophes
replaceLineBreaks: false, // replace all line breaks with BR's
removeLineBreaks: true, // put everything on one line
useXHTML: false, // add closing slashes on BR's
removeSoftHyphens: true, // remove character which encodes to &#173; or &shy;
dontEncodeNonLatin: false, // skip non-latin character encoding
keepBoldEtc: false, // any bold, strong, i or em tags are stripped of attributes and retained
addMissingSpaces: true, // adds missing spaces after dots/colons/semicolons, unless it's URL
/**
* Remove html tag, multiple space...
* @param {string} html
*/
cleanText(html) {
// Clean html data
let result = detergent(html, {
removeWidows: false, // Replace the last space in paragraph with &nbsp;
convertEntities: false, // Encode all non-ASCII chars
convertDashes: false, // Typographically-correct the n/m-dashes
convertApostrophes: false, // Typographically-correct the apostrophes
replaceLineBreaks: true, // Replace all line breaks with BR's
removeLineBreaks: false, // Put everything on one line
useXHTML: false, // Add closing slashes on BR's
removeSoftHyphens: true, // Remove character which encodes to &#173; or &shy;
dontEncodeNonLatin: true, // Skip non-latin character encoding
keepBoldEtc: false, // Any bold, strong, i or em tags are stripped of attributes and retained
addMissingSpaces: true, // Adds missing spaces after dots/colons/semicolons, unless it's URL
});
resultText = resultText.replace(/[Æ]/g, 'ae'); // .replace(/[\n\\/]/gm, ' ').replace(/[<br>]/gm, ' ').replace(/[^\S]{2,}/gm, ' ');
/**
* ajoute un espace avant un mot commencant par une majuscule pour que
* les mots ne soit pas collés sauf s'il s'agit d'un guillemet
* Replace <br> by space because detergent replace line break by <br>
* Remove noise and replace multiple space by one
*/
resultText = resultText.replace(/([^A-Z'`])([A-Z])/g, '$1 $2');
// remplace les caractères spéciaux
resultText = resultText.replace(/[\/<>_():\\«»"]/g, ' ');
// enleve undefined ou null
resultText = resultText.replace(/undefined|null/g, ' ');
// remplace les espaces et les retours à la ligne par un espace
resultText = resultText.replace(/(\s{2,})/g, ' ');
return unfancy(resultText);
result = result.replace(/<br>/gi, ' ')
.replace(/undefined|null/g, '')
.replace(/[–#"/|\\-]/g, ' ')
.replace(/[^\S]{2,}/gm, ' ');
return result;
},
cleanHtml(html) {
return sanitizeHtml(html, {
allowedTags: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol',
'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div',
'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre'],
allowedAttributes: [],
allowedClasses: {
div: ['breadcrumb'],
},
selfClosing: ['img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta'],
allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {},
allowProtocolRelative: true,
});
},
// url for crawl
checkCrawlUrl(url) {
return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx|mp3|doc)/.test(url);
},
// clean url
cleanUrl(url) {
return url.trim() // remove space before and after
.replace('https', 'http') // replace https by http
.replace(/[#?].*/, '') // remove # and text after
.replace(/\/$/, '') // remove last slash
.toLowerCase(); // lowercase
convertUrlToText(url) {
return this.cleanText(decodeURI(url).replace(/http|www|html/g, ' ').replace(/\.|-/g, ' '));
},
};
export default checkData;
{
"name": "meteorSearch",
"author": "Nacim",
"private": true,
"scripts": {
"start": "meteor --settings settings.json",
"test": "meteor test --driver-package practicalmeteor:mocha --port 3001",
"eslint": "eslint .; exit 0"
},
"dependencies": {
"awesomplete": "^1.1.2",
"babel-runtime": "^6.26.0",
"bcrypt": "^1.0.2",
"bootstrap-sass": "^3.3.7",
"chart.js": "^2.6.0",
"crawler": "^1.0.5",
"datatables.net-bs": "^1.10.15",
"detergent": "^2.30.1",
"elasticsearch": "^13.3.1",
"izitoast": "^1.1.5",
"jquery": "^1.11.2",
"lodash": "^4.17.4",
"meteor-node-stubs": "~0.2.11",
"moment": "^2.18.1",
"sanitize-html": "^1.14.1",
"simpl-schema": "^0.3.2",
"sitemapper": "^2.1.13",
"string-unfancy": "^2.0.1",
"sweetalert2": "^6.6.9",
"twitter": "^1.7.1"
},
"devDependencies": {
"@meteorjs/eslint-config-meteor": "^1.0.5",
"babel-eslint": "^7.2.3",
"eslint": "^4.5.0",
"eslint-config-airbnb": "^15.1.0",
"eslint-import-resolver-meteor": "^0.4.0",
"eslint-plugin-import": "^2.7.0",
"eslint-plugin-jsx-a11y": "^5.1.1",
"eslint-plugin-meteor": "^4.1.4",
"eslint-plugin-promise": "^3.5.0",
"eslint-plugin-react": "^7.3.0"
}
"name": "meteorSearch",
"version": "0.0.1",
"description": "Awesome boilerplate with Meteor and ElasticSearch",
"author": {
"name": "Nacim Goura",
"email": "ngoura@idfr.net",
"url": "nacimgoura.fr"
},
"private": true,
"scripts": {
"start": "meteor --settings settings.json",
"test": "meteor test --driver-package practicalmeteor:mocha --port 3001",
"eslint": "eslint .; exit 0"
},
"keywords": [
"meteor",
"elasticsearch",
"boilerplate",
"javascript",
"nodejs",
"search"
],
"dependencies": {
"awesomplete": "^1.1.2",
"babel-runtime": "^6.26.0",
"bcrypt": "^1.0.2",
"bootstrap-sass": "^3.3.7",
"chart.js": "^2.6.0",
"cheerio": "^1.0.0-rc.2",
"datatables.net-bs": "^1.10.15",
"detergent": "^2.30.1",
"elasticsearch": "^13.3.1",
"izitoast": "^1.1.5",
"jquery": "^1.11.2",
"lodash": "^4.17.4",
"meteor-node-stubs": "~0.2.11",
"moment": "^2.18.1",
"sanitize-html": "^1.14.1",
"simpl-schema": "^0.3.2",
"simplecrawler": "^1.1.5",
"sitemapper": "^2.1.13",
"sweetalert2": "^6.6.9",
"twitter": "^1.7.1"
},
"devDependencies": {
"@meteorjs/eslint-config-meteor": "^1.0.5",
"babel-eslint": "^7.2.3",
"eslint": "^4.5.0",
"eslint-config-airbnb": "^15.1.0",
"eslint-import-resolver-meteor": "^0.4.0",
"eslint-plugin-import": "^2.7.0",
"eslint-plugin-jsx-a11y": "^5.1.1",
"eslint-plugin-meteor": "^4.1.4",
"eslint-plugin-promise": "^3.5.0",
"eslint-plugin-react": "^7.3.0"
}
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment