Commit 148d1046 authored by Nacim Goura

change crawl website

parent b668f68b
......@@ -56,6 +56,7 @@ export default class crawlWebsite extends CrawlGeneric {
'Googlebot/2.1 (+http://www.google.com/bot.html)',
],
callback: (error, res, done) => {
console.log(crawl.queueSize, res.options.uri);
if (error || res.statusCode !== 200) {
this.listDataError.push({
url: res.options.uri,
......@@ -67,27 +68,22 @@ export default class crawlWebsite extends CrawlGeneric {
res.$('a').each((index, a) => {
const urlHref = res.$(a).attr('href');
if (urlHref) {
let toQueueUrl = url.resolve(res.request.uri.href, res.$(a).attr('href'));
let toQueueUrl = url.resolve(res.options.uri, res.$(a).attr('href'));
// clean url
toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if same domain name
toQueueUrl = toQueueUrl.replace('https', 'http').replace(/\/$/, '');
// console.log(toQueueUrl, toQueueUrl.includes(this.config.domain));
if (toQueueUrl.includes(this.config.domain)) {
// clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited
if (!this.listUrlAlreadyVisited.includes(toQueueUrl)) {
// check if url has forbidden word
if (this.config.forbiddenWord.length === 0 || !new RegExp(this.config.forbiddenWord.join('|')).test(toQueueUrl)) {
// add url in already visited urls
this.listUrlAlreadyVisited.push(toQueueUrl);
// check if url is good for crawl
if (checkData.checkCrawlUrl(toQueueUrl)) {
// add url in already visited urls
this.listUrlAlreadyVisited.push(toQueueUrl);
// check if url is pdf
if (toQueueUrl.match(/(pdf)/)) {
listPdf.push(toQueueUrl);
} else {
crawl.queue(toQueueUrl);
}
crawl.queue(toQueueUrl);
} else {
listPdf.push(toQueueUrl);
}
}
}
......@@ -118,7 +114,8 @@ export default class crawlWebsite extends CrawlGeneric {
crawl.queue(urls);
crawl.on('drain', () => {
console.error(this.listDataError);
console.error(`Nombre de 404 : ${_.filter(this.listDataError, n => n.error === 404).length}`);
console.error(`Autre Erreur : ${_.filter(this.listDataError, n => n.error !== 404).length}`);
resolve(this.listDataForIndex);
});
});
......
// Scratch crawl of anap.fr with simplecrawler: print the url of every queue
// item handed to the 'fetchcomplete' listener.
const Crawler = require('simplecrawler');

const crawler = new Crawler('http://www.anap.fr/');

// Only the queue item is needed; the extra listener arguments are ignored.
function logFetchedUrl(queueItem) {
  console.log(queueItem.url);
}

crawler.on('fetchcomplete', logFetchedUrl);
crawler.start();
/* eslint quotes: off */
/* eslint quote-props: off */
import indexationElastic from '/imports/libs/elasticsearch/elasticsearch';
......@@ -24,50 +26,55 @@ export default class Search {
* common ( sépare les tokens les plus présents dans l’index des autres, et ne les utilise que pour améliorer la pertinence )
* fuzziness (permet une recherche même avec des fautes)
*/
params._source = ["title", "url"];
params.query = {
bool: {
must: [{
multi_match: {
query: term,
fuzziness: 'AUTO',
fields: [
'body.stemmed',
'title.stemmed',
],
"bool": {
"must": [
{
"multi_match": {
"query": term,
"fuzziness": "AUTO",
"fields": [
"title.stemmed",
"urlText.stemmed",
"description.stemmed",
],
"boost": 3,
},
},
}],
should: [{
multi_match: {
query: term,
fuzziness: 'AUTO',
fields: [
'description',
'description.stemmed',
'urlText',
'urlText.stemmed',
'title',
'url',
'h1',
'urlText',
'urlText.stemmed',
],
boost: 3.0,
],
"should": [
{
"multi_match": {
"query": term,
"fuzziness": "AUTO",
"fields": [
"description",
"urlText",
"title",
"h1",
"breadcrumb",
"urlText",
"urlText.stemmed",
],
"boost": 2,
},
},
}, {
multi_match: {
query: term,
fuzziness: 'AUTO',
fields: [
'body',
'breadcrumb',
'h1',
'h2',
'html',
],
boost: 1.0,
{
"multi_match": {
"query": term,
"fuzziness": "AUTO",
"fields": [
"body.stemmed",
"body",
"h2",
"html",
],
"boost": 1,
},
},
}],
minimum_should_match: 1,
],
"minimum_should_match": 1,
},
};
}
......
......@@ -68,14 +68,18 @@ const checkData = {
});
},
// url pour crawler
// url for crawl
checkCrawlUrl(url) {
return !url.match(/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf)/);
return !/\.(jpeg|jpg|gif|png|js|css|ico|eot|svg|woff|ttf|zip|rar|tar|pdf|xml|xlsx|docx)/.test(url);
},
// clean url (remove text after # and ?)
// clean url
cleanUrl(url) {
return url.replace(/(#.*)/g, '').replace(/\/{2,}/g, '/');
return url.trim() // remove space before and after
.replace('https', 'http') // replace https by http
.replace(/#.*/, '') // remove # and text after
.replace(/\/$/, '') // remove last slash
.toLowerCase(); // lowercase
},
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment