Commit 898bc444 authored by Nacim Goura

add docker and elasticsearch

parent 64894d0f
node_modules
npm-debug.log
public/bower_components
/node_modules
.idea
yarn.lock
/public/bower_components
\ No newline at end of file
/public/bower_components
*.log
\ No newline at end of file
FROM node:boron
# Create app directory
RUN mkdir -p /usr/src/idsearch
WORKDIR /usr/src/idsearch
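# Copy the project files into the image, then install all dependencies.
# NOTE(review): `COPY *` flattens top-level directories and invalidates this layer on every file change; copying only package.json before `npm install` would keep the install layer cached - confirm that no install script needs other files first.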
COPY * /usr/src/idsearch/
RUN npm install
# Bundle app source
COPY . /usr/src/idsearch
EXPOSE 3000
CMD ["npm", "start"]
......@@ -2,9 +2,11 @@
const Sitemapper = require('sitemapper');
const Crawler = require("crawler");
const utf8 = require('utf8');
const chalk = require('chalk');
const stripTags = require('strip-tags');
const sanitizeHtml = require('sanitize-html');
const Q = require('q');
const searchSite = require('../model/search.server.models');
module.exports = {
......@@ -29,36 +31,41 @@ module.exports = {
* @param listSite
* @param term
*/
// Crawl every URL in `listSite` and send each successfully fetched page to the search index.
// NOTE(review): this excerpt comes from a diff view, so the pre-change (`crawlUrl`) and
// post-change (`indexUrl`) lines appear side by side below - confirm against the real file.
crawlUrl(listSite, term) {
indexUrl(listSite) {
let c = new Crawler({
maxConnections : 1000,
// This will be called for each crawled page
callback : function (error, res, done) {
if(error){
console.log(error);
console.log(chalk.red(error));
}else if(res.statusCode === 200) {
// Only pages answered with HTTP 200 are processed; other statuses fall through to done().
let $ = res.$;
let body = removeTags($);
let urlVisited = res.options.uri;
let titre = $("title").text().toLowerCase();
console.log("visited url : "+chalk.yellow(urlVisited));
console.log("titre : "+chalk.blue(titre));
console.log($('h1').text());
getInternalLinks($);
// Document sent to the search index: cleaned body text, lower-cased title, decoded URL.
let obj = {
body : cleanHtml($),
title: $("title").text().toLowerCase(),
urlVisited: utf8.decode(urlVisited)
};
// Indexing is fire-and-forget: the promise is only logged, done() below does not wait for it.
searchSite.index(obj)
.then(function(resp) {
console.log(resp);
}, function(err) {
console.log(err);
});
// NOTE(review): `term` is only a parameter of the `crawlUrl` header above, not of `indexUrl`;
// presumably these word-search lines belong to the pre-change version - confirm.
let isWordFound = searchForWord(body, term);
if(isWordFound) {
console.log('Word ' + chalk.green(term) + ' found ' + countOccurenceWord(body, term) + ' time on this page ');
}
}
// Tell the crawler this page is finished, whatever branch was taken above.
done();
}
});
c.queue(listSite);
// The URLs are queued once searchSite.initElasticsearch() has resolved; on failure nothing is queued here.
searchSite.initElasticsearch()
.then((success) => {
console.log(chalk.green(success));
c.queue(listSite);
}, function(err) {
console.log(chalk.red(err));
});
}
};
......@@ -66,9 +73,21 @@ module.exports = {
/**
 * Extract the lower-cased text of the page body and strip content that is
 * useless for indexing.
 *
 * NOTE(review): this excerpt comes from a diff view; the `removeTags` header
 * and the `stripTags(...)` return appear to be the pre-change lines, the
 * `cleanHtml` header and the `.replace(...)` chain the post-change lines.
 *
 * @param $ selector function exposed by the crawler for the fetched page
 * @returns the body text, lower-cased
 */
function removeTags($) {
function cleanHtml($) {
let bodyText = $('html > body').text().toLowerCase();
return stripTags(bodyText, ['link', 'script']);
// Newlines become spaces, then everything from "function" up to a "}" is blanked out.
// NOTE(review): `.*` is greedy and the newlines were just removed, so one match can run
// from the first "function" to the last "}" of the text (unless other line terminators
// remain) - this may drop legitimate page content; confirm the intent.
return bodyText
.replace(/\n/g, " ")
.replace(/function.*}/g, " ");
/*return sanitizeHtml(bodyText, {
allowedTags: [ 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol',
'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div',
'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre' ],
allowedAttributes: false,
selfClosing: [ 'img', 'br', 'hr', 'area', 'base', 'basefont', 'input', 'link', 'meta' ],
allowedSchemes: [ 'http', 'https', 'ftp', 'mailto' ],
allowedSchemesByTag: {},
allowProtocolRelative: true
});*/
}
/**
......
"use strict";
const elasticsearch = require('elasticsearch');
const Q = require('q');
const esClient = new elasticsearch.Client({
host: '127.0.0.1:9200',
const client = new elasticsearch.Client({
host: '127.0.0.1:9201',
log: 'error'
});
\ No newline at end of file
});
/**
* Check is index exists or not
*/
exports.indexExists = function (esIndex) {
let deferred = Q.defer();
client.indices.exists({
index: esIndex,
ignore: [404]
}).then((exist) => {
deferred.resolve(exist);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Close index
*/
exports.indexClose = function (esIndex) {
let deferred = Q.defer();
client.indices.close({
index: esIndex,
ignore: [404]
}).then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Open index
*/
exports.indexOpen = function (esIndex) {
let deferred = Q.defer();
client.indices.open({
index: esIndex,
ignore: [404]
}).then( (exists) => {
deferred.resolve(exists);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Create index
*/
exports.indexCreate = function (esIndex, obj) {
let deferred = Q.defer();
client.indices.create({
index: esIndex,
body: obj
}).then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Delete index
*/
exports.indexDelete = function (esIndex) {
let deferred = Q.defer();
client.indices.delete({
index: esIndex,
ignore: [404]
}).then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Set index setting
*/
exports.indexSetSettings = function (esIndex, obj, callback) {
let deferred = Q.defer();
client.indices.putSettings({
index: esIndex,
body: obj
}).then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Check is type exists or not
*/
exports.typeExists = function (esIndex, esType) {
let deferred = Q.defer();
client.indices.existsType({
index: esIndex,
type: esType
}).then((exists) => {
deferred.resolve(exists);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Create mapping
*/
exports.mappingCreate = function (esIndex, esType, obj) {
let deferred = Q.defer();
client.indices.putMapping({
index: esIndex,
type: esType,
body: obj
}).then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Delete mapping
*/
exports.mappingDelete = function (esIndex, esType) {
let deferred = Q.defer();
this.indexClose(esIndex)
.then(() => {
client.indices.deleteMapping({
index: esIndex,
type: esType
}).then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject('Error in elasticsearch mappingDelete() : indexClose() ' + err);
});
}, (err) => {
deferred.reject('Error in elasticsearch mappingDelete() : indexClose() ' + err);
});
return deferred.promise;
};
/**
* Index object
*/
exports.index = function (esIndex, esType, id, obj) {
let deferred = Q.defer();
client.index({
index: esIndex,
type: esType,
id: id,
body: obj,
requestTimeout: 120000
}).then(function (resp) {
deferred.resolve(resp);
}, function(err) {
console.log('Error in elasticsearch index() : ' + err);
deferred.reject(err);
});
return deferred.promise;
};
/**
* Update object
*/
// exports.update = function (esIndex, esType, id, obj, callback) {
// client.update({
// index: esIndex,
// type: esType,
// id: id,
// body: obj
// }, function (err, resp) {
// if (err) {
// console.log('Error in elasticsearch update() : ' + err);
// }
//
// if (callback) {
// callback(err, resp);
// }
// });
// };
/**
* Delete object
*/
exports.delete = function (esIndex, esType, id) {
let deferred = Q.defer();
client.delete({
index: esIndex,
type: esType,
id: id
}).then(function (resp) {
deferred.resolve(resp);
}, function(err) {
deferred.reject(err);
});
return deferred.promise;
};
/**
* Bulk
*/
exports.bulk = function (body, callback) {
client.bulk({
body: body
}, function (err, resp) {
if (err) {
console.log('Error in elasticsearch bulk() : ' + err);
}
if (callback) {
callback(err, resp);
}
// client.indices.clearCache({index:["pple"]}, function(errorFullMessage, display, status) {
// if (errorFullMessage) {
// console.log('clear-cache', display.status, display.error.reason, '"' + display.error.index + '"');
// // } else {
// // console.log(display);
// }
// if (callback) {
// callback(err, resp);
// }
// });
});
};
/**
* Search object
*/
exports.search = function (esIndex, esType, params, callback) {
client.search({
index: esIndex,
type: esType,
body: params
}, function (err, exists) {
if (callback) {
callback(err, exists);
}
});
};
/**
* Init analyser
*/
exports.initAnalyser = function (esIndex, obj) {
let deferred = Q.defer();
this.indexExists(esIndex).then((exists) => {
if (exists) {
this.indexDelete(esIndex)
.then(() => {
this.indexCreate(esIndex, obj)
.then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
}, (err) => {
deferred.reject('Error in elasticsearch initAnalyser() : indexDelete() ' + err);
});
} else {
this.indexCreate(esIndex, obj)
.then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
}
}, (err) => {
deferred.reject('Error in elasticsearch initAnalyser() - indexExists() : ' + err);
});
return deferred.promise;
};
/**
* Init analyser
*/
exports.initMapping = function (esIndex, esType, obj) {
let deferred = Q.defer();
console.log('===================================================');
console.log('esIndex',esIndex);
console.log('esType',esType);
this.typeExists(esIndex, esType).then((exists) => {
console.log('exists', exists);
console.log('===================================================');
if (exists) {
this.mappingDelete(esIndex, esType).then(() => {
this.mappingCreate(esIndex, esType, obj)
.then(() => {
this.indexOpen(esIndex)
.then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
}, (err) => {
deferred.reject('Error in elasticsearch initMapping() : mappingCreate() ' + err);
});
}, (err) => {
deferred.reject('Error in elasticsearch initMapping() : mappingDelete() ' + err);
});
} else {
this.indexClose(esIndex).then(() => {
this.mappingCreate(esIndex, esType, obj)
.then(() => {
this.indexOpen(esIndex)
.then((resp) => {
deferred.resolve(resp);
}, (err) => {
deferred.reject(err);
});
}, (err) => {
deferred.reject('Error in elasticsearch initMapping() : mappingCreate() ' + err);
});
}, (err) => {
deferred.reject('Error in elasticsearch initMapping() : indexClose() ' + err);
});
}
}, (err) => {
deferred.resolve('Error in elasticsearch initMapping() - typeExists() : ' + err);
});
return deferred.promise;
};
\ No newline at end of file
"use strict";
const elasticsearch = require('../library/elasticsearch/elasticsearch');
const Q = require('q');
const esIndex = 'haute-savoie',
esType = 'crawl';
exports.index = function(site) {
let deferred = Q.defer();
elasticsearch.index(esIndex, esType, site.urlVisited, site)
.then(function(resp) {
console.log(resp);
deferred.resolve(resp);
}, function(err) {
deferred.reject(err);
});
return deferred.promise;
};
exports.delete = function(site) {
let deferred = Q.defer();
elasticsearch.delete(esIndex, esType, site.urlVisited)
.then(function(resp) {
deferred.resolve(resp);
}, function(err) {
deferred.reject(err);
});
return deferred.promise;
};
exports.initElasticsearch = function () {
let deferred = Q.defer();
let analyser = {
settings: {
analysis: {
filter: {
"french_elision": {
"type": "elision",
"articles_case": true,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"french_stemmer": {
"type": "stemmer",
"language": "french"
}
},
analyzer: {
analyzer_html: {
tokenizer: 'standard',
char_filter: [ "html_strip" ],
"filter": [
"french_elision",
"lowercase",
"french_stop",
"french_stemmer"
]
}
}
}
}
};
let mapping = {
properties: {
title: {
type: "text",
analyzer: "analyzer_html"
},
body: {
type: "text",
analyzer: "analyzer_html"
},
urlVisited: {
type: "text",
analyzer: "standard"
}
}
};
elasticsearch.initAnalyser(esIndex, analyser)
.then(function() {
console.log("init analyser done");
elasticsearch.initMapping(esIndex, esType, mapping)
.then(function () {
console.log("init mapping done");
deferred.resolve("init elasticsearch done");
}, function(err) {
deferred.reject(err);
});
}, function (err) {
deferred.reject(err);
});
return deferred.promise;
};
\ No newline at end of file
......@@ -18,8 +18,9 @@
"morgan": "~1.7.0",
"node-sass-middleware": "^0.11.0",
"q": "^1.4.1",
"sanitize-html": "^1.14.1",
"sitemapper": "^2.1.7",
"strip-tags": "^0.1.1"
"utf8": "^2.1.2"
},
"devDependencies": {
"eslint": "^3.16.1"
......
......@@ -5,9 +5,13 @@ const crawler = require('../controller/crawlerController');
module.exports = function(app) {
app.route('/').get(function(req, res) {
res.render('index', { title: 'Express' });
});
app.route('/index').get(function(req, res) {
crawler.getSiteMap('http://www.hautesavoie.fr/sitemap.xml')
.then(function(sites) {
crawler.crawlUrl(sites, 'test');
crawler.indexUrl(sites);
}, function(err) {
console.log(err);
});
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment