Commit 79120e83 authored by Nacim Goura

init crawl api facebook and modif form

parent 537809df
......@@ -30,6 +30,7 @@
"rules": {
"indent": ["error", 4, { "SwitchCase": 1 }],
"import/extensions": ["off", "never"],
"import/no-absolute-path": "off",
"import/no-extraneous-dependencies": "off",
"no-underscore-dangle": "off",
"class-methods-use-this": "off",
......
......@@ -37,7 +37,7 @@ check@1.2.5
# package for user
accounts-base@1.3.0
accounts-password@1.3.6
accounts-password
alanning:roles
practicalmeteor:chai
......
accounts-base@1.3.0
accounts-password@1.3.6
accounts-password@1.3.7
ajduke:bootstrap-tagsinput@0.7.1
alanning:roles@1.2.16
aldeed:autoform@6.2.0
......@@ -67,7 +67,7 @@ modules-runtime@0.8.0
momentjs:moment@2.18.1
mongo@1.1.18
mongo-id@1.0.6
npm-bcrypt@0.9.2
npm-bcrypt@0.9.3
npm-mongo@2.2.24
observe-sequence@1.0.16
ordered-dict@1.0.9
......
import CrawFacebook from './crawlFacebook';
/**
 * Factory for the API crawlers: selects the crawler matching `data.type`.
 *
 * The constructor returns the object built by the selected crawler instead of
 * an instance of this class, so `new` on this class evaluates to that object
 * (expected to be a promise of the data to index).
 */
export default class crawlWebsite {
    /**
     * @param {Object} data
     * @param {string} data.type - API to crawl: 'facebook', 'twitter' or 'instagram'
     * @param {string} data.tokenPage - access token handed to the crawler
     * @throws {Meteor.Error} when the type is unknown or has no crawler yet
     */
    constructor(data) {
        switch (data.type) {
            case 'facebook':
                return new CrawFacebook(data.tokenPage);
            case 'twitter':
            case 'instagram':
                // No crawler exists for these yet. Falling through with `break`
                // made `new` return a bare instance without `.then`, which
                // surfaced as an unrelated TypeError in the caller.
                throw new Meteor.Error('Error', `L'API ${data.type} n'est pas encore prise en charge`);
            default:
                throw new Meteor.Error('Error', 'Aucun type pour l\'API indiqué');
        }
    }
}
import { Meteor } from 'meteor/meteor';
import FB from 'fb';
import _ from 'lodash';
import checkData from '../../utils/checkData';
/**
 * Fetches the `me/feed` publications readable with a Facebook token and maps
 * them to a flat list of bulk-index entries (one action object followed by one
 * document object per publication).
 */
export default class CrawlFacebook {
    /**
     * @param {string} token - Facebook access token
     * @returns {Promise<Array>} the constructor returns the promise created by
     *   `start()`, so `new CrawlFacebook(token)` evaluates to that promise
     */
    constructor(token) {
        this.listDataForIndex = [];
        console.log('crawl facebook! ');
        FB.setAccessToken(token);
        return this.start();
    }

    /**
     * Requests the feed and converts it to index entries.
     * @returns {Promise<Array>} rejects with a Meteor.Error when the API
     *   answers with an error / nothing, or when the feed is empty
     */
    start() {
        return new Promise((resolve, reject) => {
            FB.api('me/feed', (res) => {
                if (!res || res.error) {
                    reject(new Meteor.Error('Error', 'Token invalide!'));
                    // Stop here: `res` may be undefined, reading `res.data` would throw.
                    return;
                }
                if (!res.data || !res.data.length) {
                    reject(new Meteor.Error('Error', 'Aucune publication Facebook!'));
                    return;
                }
                // Forward failures of the mapping step instead of leaving the promise pending.
                this.parseData(res.data).then(resolve, reject);
            });
        });
    }

    /**
     * Maps feed items to index entries and appends them to `listDataForIndex`.
     * @param {Array<Object>} data - feed items (`id`, and optionally `name`,
     *   `message`, `description`, `story`, `created_time`)
     * @returns {Promise<Array>} resolves with `this.listDataForIndex`, also
     *   when `data` is empty
     */
    parseData(data) {
        // An exception thrown inside the executor turns into a rejection.
        return new Promise((resolve) => {
            const { esIndex, esType } = Meteor.settings.private.elasticsearch;
            data.forEach((item) => {
                const itemUrl = `https://www.facebook.com/${item.id}`;

                // The same text feeds both `description` and `body`. It starts
                // from an empty string so `body` no longer begins with "undefined".
                let text = '';
                if (item.message) {
                    text += `${checkData.cleanText(item.message)} `;
                }
                if (item.description) {
                    text += `${checkData.cleanText(item.description)} `;
                }
                if (item.story) {
                    text += checkData.cleanText(item.story);
                }

                const dataForIndex = {
                    tag: 'social',
                    url: itemUrl,
                    description: text,
                    body: text,
                    urlText: checkData.slugText(checkData.cleanText(itemUrl)),
                };
                if (item.name) {
                    const title = checkData.cleanText(item.name);
                    dataForIndex.title = title;
                    dataForIndex.title_suggest = title;
                }
                if (item.created_time) {
                    dataForIndex.createdAt = item.created_time;
                }

                // Bulk format: the action line comes right before its document.
                this.listDataForIndex.push({
                    index: {
                        _index: esIndex,
                        _type: esType,
                        _id: itemUrl,
                    },
                });
                this.listDataForIndex.push(dataForIndex);
            });
            resolve(this.listDataForIndex);
        });
    }
}
import url from 'url';
import { Meteor } from 'meteor/meteor';
import Crawler from 'simplecrawler';
import _ from 'lodash';
import cheerio from 'cheerio';
import Crawler from 'crawler';
import Sitemapper from 'sitemapper';
import checkData from '../../utils/checkData';
/**
 * Crawls a website, starting from a page or from the URLs listed in a
 * sitemap, follows same-domain links and builds a flat list of bulk-index
 * entries (one action object followed by one document object per page).
 */
export default class crawlWebsite {
    /**
     * @param {string} urlWebsite - start page or sitemap URL
     * @returns {Promise<Array>} the constructor returns the promise created by
     *   `start()`, so `new crawlWebsite(url)` evaluates to that promise
     */
    constructor(urlWebsite) {
        this.urlWebsite = urlWebsite;
        const defaults = {
            domain: url.parse(urlWebsite).hostname,
            forbiddenWord: [],
        };
        this.config = defaults;
        try {
            const syncFunc = Meteor.wrapAsync(Meteor.call);
            // Merge instead of replace, so a stored config that lacks `domain`
            // or `forbiddenWord` still leaves usable values in place.
            this.config = Object.assign({}, defaults, syncFunc('getConfig'));
        } catch (err) {
            // Best effort: keep crawling with the defaults.
            console.error(err);
        }
        console.log(this.config);
        return this.start();
    }

    /**
     * Crawls the start URL(s) and every link that passes `_shouldFollow`.
     * @returns {Promise<Array>} resolves with `this.listDataForIndex` once the
     *   crawler queue is drained
     * @throws {Meteor.Error} (as a rejection) when there is no URL to crawl
     */
    async start() {
        const sitemap = new Sitemapper();
        let urls = [this.urlWebsite];
        // When the entry point is a sitemap, crawl the URLs it lists instead.
        if (checkData.isSitemap(this.urlWebsite)) {
            const { sites } = await sitemap.fetch(this.urlWebsite);
            urls = sites;
        }
        if (!urls || urls.length === 0) {
            throw new Meteor.Error('indexation', 'Aucune url fourni!');
        }

        this.listDataError = [];
        this.listDataForIndex = [];
        // Copy, so recording visited links does not grow the caller's list.
        this.listUrlAlreadyVisited = urls.slice();
        console.log(`${urls.length} à parser!`);

        // Built once per crawl. With no forbidden word there must be no
        // pattern at all: `new RegExp('')` matches every URL and would block
        // every link.
        const forbiddenWords = [].concat(this.config.forbiddenWord || []).filter(Boolean);
        const forbiddenPattern = forbiddenWords.length
            ? new RegExp(forbiddenWords.join('|'))
            : null;

        return new Promise((resolve) => {
            const crawl = new Crawler({
                skipDuplicates: true,
                callback: (error, res, done) => {
                    const currentUri = res && res.options ? res.options.uri : undefined;
                    try {
                        if (error || res.statusCode !== 200) {
                            this.listDataError.push({
                                url: currentUri,
                                error: error || res.statusCode,
                            });
                        } else if (res.$) {
                            const listPdf = [];
                            // Collect the links found in the page.
                            res.$('a').each((index, a) => {
                                const urlHref = res.$(a).attr('href');
                                if (!urlHref) {
                                    return;
                                }
                                const toQueueUrl = url.resolve(res.request.uri.href, urlHref);
                                if (!this._shouldFollow(toQueueUrl, forbiddenPattern)) {
                                    return;
                                }
                                this.listUrlAlreadyVisited.push(toQueueUrl);
                                // PDF links are attached to the page instead of being crawled.
                                if (toQueueUrl.match(/(pdf)/)) {
                                    listPdf.push(toQueueUrl);
                                } else {
                                    console.log(toQueueUrl);
                                    crawl.queue(toQueueUrl);
                                }
                            });
                            // Only pages without query string / fragment are indexed.
                            if (!currentUri.match(/(#.*|\?.*)/g)) {
                                this.parseData(res.$, currentUri, listPdf);
                            }
                        }
                    } catch (err) {
                        this.listDataError.push({ url: currentUri, error: err });
                    } finally {
                        // Always release the crawler slot, otherwise 'drain'
                        // never fires and the promise stays pending.
                        done();
                    }
                },
            });
            crawl.queue(urls);
            crawl.on('drain', () => {
                console.error(this.listDataError);
                resolve(this.listDataForIndex);
            });
        });
    }

    /**
     * Tells whether a link found in a page must be queued.
     * @param {string} toQueueUrl - absolute URL of the link
     * @param {RegExp|null} forbiddenPattern - pattern of forbidden words, or
     *   null when none is configured
     * @returns {boolean}
     */
    _shouldFollow(toQueueUrl, forbiddenPattern) {
        // same domain name
        if (!toQueueUrl.includes(this.config.domain)) {
            return false;
        }
        // not visited yet
        if (this.listUrlAlreadyVisited.includes(toQueueUrl)) {
            return false;
        }
        // no forbidden word
        if (forbiddenPattern && forbiddenPattern.test(toQueueUrl)) {
            return false;
        }
        return Boolean(checkData.checkCrawlUrl(toQueueUrl));
    }

    /**
     * Maps a fetched page to an index document and appends the bulk action and
     * the document to `listDataForIndex`.
     * @param $ - selector function of the fetched page (`res.$`)
     * @param {string} currentUrl - URL of the page
     * @param {Array<string>} listPdf - PDF links found in the page
     */
    parseData($, currentUrl, listPdf) {
        const body = $('body');
        body.html(checkData.cleanHtml(body.html()));
        const title = checkData.cleanText($('title').text());
        const dataForIndex = {
            tag: 'site',
            title,
            title_suggest: {
                input: title,
            },
            description: $('meta[name=description]').attr('content'),
            body: checkData.slugText(checkData.cleanText(body.text())),
            html: checkData.cleanText(body.html()),
            urlText: checkData.slugText(checkData.cleanText(decodeURI(currentUrl))),
            url: decodeURI(currentUrl),
            createdAt: new Date(),
        };
        const breadcrumbText = $(this.config.breadcrumb).text();
        if (breadcrumbText.length) {
            dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText(breadcrumbText));
        }
        if ($('h1').text().length) {
            dataForIndex.h1 = checkData.slugText(checkData.cleanText($('h1').text()));
        }
        if ($('h2').text().length) {
            dataForIndex.h2 = checkData.slugText(checkData.cleanText($('h2').text()));
        }
        if (listPdf && listPdf.length) {
            dataForIndex.listPdf = listPdf.join(' ');
        }
        // Bulk format: the action line comes right before its document.
        this.listDataForIndex.push({
            index: {
                _index: Meteor.settings.private.elasticsearch.esIndex,
                _type: Meteor.settings.private.elasticsearch.esType,
                _id: currentUrl,
            },
        });
        this.listDataForIndex.push(dataForIndex);
    }
}
import IndexGeneric from './indexGeneric';
import CrawlApi from '../crawl/crawlApi';
/**
 * Indexer for content obtained through a third-party API crawler.
 */
export default class IndexApi extends IndexGeneric {
    /**
     * Runs the API crawler selected by `data`, then sends what it produced to
     * the bulk indexer.
     * @param {Object} data - crawl form values handed to CrawlApi
     * @returns {Promise} result of `indexByBulk`; any crawl or indexing error
     *   is propagated unchanged as a rejection
     */
    async indexationApi(data) {
        const crawl = new CrawlApi(data);
        const sendToIndex = (dataToIndex) => this.indexByBulk(dataToIndex);
        return crawl.then(sendToIndex);
    }
}
import url from 'url';
import { Meteor } from 'meteor/meteor';
import Sitemapper from 'sitemapper';
import Crawler from 'crawler';
import checkData from '../../utils/checkData';
import IndexGeneric from './indexGeneric';
import CrawlWebsite from '../crawl/crawlWebsite';
const sitemap = new Sitemapper();
/**
 * Indexer for the pages of a website: crawling is delegated to CrawlWebsite,
 * indexing to the `indexByBulk` method inherited from IndexGeneric.
 */
export default class IndexWebsite extends IndexGeneric {
    /**
     * Crawls a website and bulk-indexes the collected pages.
     * @param {string} urlWebsite - start page or sitemap URL
     * @returns {Promise} result of `indexByBulk`; any crawl or indexing error
     *   is propagated as a rejection
     */
    async indexationWebsite(urlWebsite) {
        return new CrawlWebsite(urlWebsite)
            .then(dataToIndex => this.indexByBulk(dataToIndex));
    }

    /**
     * Former name of `indexationWebsite`, kept so existing callers keep working.
     * @param {string} urlWebsite
     * @returns {Promise}
     */
    initIndexation(urlWebsite) {
        return this.indexationWebsite(urlWebsite);
    }
}
......@@ -2,22 +2,34 @@
import { check } from 'meteor/check';
import { Meteor } from 'meteor/meteor';
import _ from 'lodash';
import { formApiCrawlSchema } from '/imports/collections/schemaForm';
import IndexGeneric from './indexGeneric';
import IndexWebsite from './indexWebsite';
import IndexApi from './indexApi';
Meteor.methods({
initIndexElastic() {
this.unblock();
const index = new IndexGeneric();
return index.initElastic();
},
reIndexElastic() {
this.unblock();
const index = new IndexGeneric();
return index.reIndex();
},
indexWebsite(url) {
check(url, String);
this.unblock();
const index = new IndexWebsite();
return index.initIndexation(url);
return index.indexationWebsite(url);
},
indexApi(data) {
check(data, Object);
formApiCrawlSchema.validate(data);
this.unblock();
const index = new IndexApi();
return index.indexationApi(data);
},
searchByTerm(term) {
check(term, String);
......
......@@ -2,6 +2,12 @@
import SimpleSchema from 'simpl-schema';
import { Mongo } from 'meteor/mongo';
// Load the form hooks only when this module runs on the client.
if (Meteor.isClient) {
import './hooksForm';
}
// Declare `autoform` as an extra option accepted in schema definitions.
SimpleSchema.extendOptions(['autoform']);
/**
* this collection is for config (crawl, api, index)
* @type {Mongo.Collection}
......
import SimpleSchema from 'simpl-schema';
/**
 * Custom validation error messages.
 *
 * The texts are French but are registered under the `en` key. The first group
 * holds application-specific error types, the second group overrides the
 * library's native error types. `[label]`, `[min]`, `[max]`, `[value]`, ...
 * are placeholders filled in when a message is rendered.
 *
 * NOTE(review): `regEx` is given as a list of { exp, msg } pairs, and
 * `WeakEmail` / `Telephone` may not exist on `SimpleSchema.RegEx` in every
 * simpl-schema version. Confirm both against the installed version.
 */
SimpleSchema.setDefaultMessages({
    messages: {
        en: {
            supMax: 'Cette valeur est supérieure au maximum.',
            maxInfMin: 'La valeur maximale doit être supérieure à la valeur minimale.',
            startInfEnd: 'La date de début doit être inférieure à la date de fin.',
            endInfStart: 'La date de fin doit être supérieure à la date de début.',
            usernameExists: 'Ce login est déjà utilisé. Veuillez en choisir un autre',
            noMatchingPassword: 'les champs liés au mot de passe ne correspondent pas.',
            companyHasAlreadySubmittedForThisBatch: 'Cette entreprise participe déjà à une réponse pour ce lot.',
            // native messages
            required: 'Veuillez saisir ou sélectionner une valeur pour le champ [label]',
            minString: 'Veuillez saisir au moins [min] caractères',
            maxString: 'Veuillez saisir moins de [max] caractères',
            minNumber: 'Ce champ doit être supérieur ou égal à [min]',
            maxNumber: 'Ce champ doit être inférieur ou égal à [max]',
            minNumberExclusive: 'Ce champ doit être supérieur à [min]',
            maxNumberExclusive: 'Ce champ doit être inférieur à [max]',
            minDate: 'La date doit être postérieure au [min]',
            maxDate: 'La date doit être antérieure au [max]',
            badDate: 'Cette date est invalide',
            minCount: 'Vous devez saisir plus de [minCount] valeurs',
            maxCount: 'Vous devez saisir moins de [maxCount] valeurs',
            noDecimal: 'Ce champ doit être un entier',
            notAllowed: "[value] n'est pas une valeur acceptée",
            expectedString: 'Le champ [label] doit être une chaine de caractères',
            expectedNumber: 'Le champ [label] doit être un nombre',
            expectedBoolean: 'Le champ [label] doit être un booléen',
            expectedArray: 'Le champ [label] doit être un tableau',
            expectedObject: 'Le champ [label] doit être un objet',
            expectedConstructor: 'Le champ [label] doit être du type [type]',
            keyNotInSchema: "Le champ [key] n'est pas permis par le schéma",
            regEx: [
                // Entry without `exp`: generic message for any other pattern.
                { msg: 'Le champ [label] ne vérifie pas la validation par Regex' },
                { exp: SimpleSchema.RegEx.Email, msg: 'Cette adresse e-mail est incorrecte' },
                { exp: SimpleSchema.RegEx.WeakEmail, msg: 'Cette adresse e-mail est incorrecte' },
                { exp: SimpleSchema.RegEx.Domain, msg: 'Le champ [label] doit être un domaine valide' },
                { exp: SimpleSchema.RegEx.WeakDomain, msg: 'Le champ [label] doit être un domaine valide' },
                { exp: SimpleSchema.RegEx.IP, msg: 'Cette adresse IP est invalide' },
                { exp: SimpleSchema.RegEx.IPv4, msg: 'Cette adresse IPv4 est invalide' },
                { exp: SimpleSchema.RegEx.IPv6, msg: 'Cette adresse IPv6 est invalide' },
                { exp: SimpleSchema.RegEx.Url, msg: 'Cette URL est invalide' },
                { exp: SimpleSchema.RegEx.Id, msg: 'Cet identifiant alphanumérique est invalide' },
                { exp: SimpleSchema.RegEx.Phone, msg: 'Ce numéro de téléphone est invalide' },
                { exp: SimpleSchema.RegEx.Telephone, msg: 'Ce numéro de téléphone est invalide' },
            ],
        },
    },
});
import SimpleSchema from 'simpl-schema';
// Declare `autoform` as an extra option accepted in schema definitions.
SimpleSchema.extendOptions(['autoform']);

// API types offered by the form. Single source for both the accepted values
// and the radio buttons, so the two cannot drift apart.
const API_TYPES = [
    { label: 'Facebook', value: 'facebook' },
    { label: 'Twitter', value: 'twitter' },
    { label: 'Instagram', value: 'instagram' },
];

/**
 * Schema of the "crawl an API" form: the page token and the API to crawl.
 */
export default new SimpleSchema({
    tokenPage: {
        type: String,
        label: 'Token de la page',
    },
    type: {
        type: String,
        label: 'Type d\'API',
        // Without this, validation accepted any string as `type`; only the
        // form widget restricted the choice.
        allowedValues: API_TYPES.map(apiType => apiType.value),
        autoform: {
            type: 'select-radio-inline',
            options() {
                // Fresh array on each call, as before.
                return API_TYPES.map(apiType => Object.assign({}, apiType));
            },
        },
    },
}, { tracker: Tracker });
import SimpleSchema from 'simpl-schema';
// Declare `autoform` as an extra option accepted in schema definitions.
SimpleSchema.extendOptions(['autoform']);

// Fields of the login form: both are plain strings.
const loginFields = {
    login: { type: String, label: 'Login' },
    password: { type: String, label: 'Mot de passe' },
};

/**
 * Schema of the login form, bound to Tracker through the `tracker` option.
 */
export default new SimpleSchema(loginFields, { tracker: Tracker });
import displayNotif from './../../ui/components/notifs/notifs.js';
/**
 * Form hooks that report the outcome of an indexation through a notification.
 */
const indexationHooks = {
    /**
     * Called when the form submission succeeded.
     */
    onSuccess() {
        displayNotif({
            type: 'success',
            title: 'Indexation : ',
            message: 'Indexation fini avec succès!',
            save: true,
        });
    },

    /**
     * Called when the form submission failed.
     * @param {string} formType - type of the submitted form (unused)
     * @param {Error|Object|string} error - failure; `reason` is preferred,
     *   then `message`, then the string form of the value
     */
    onError(formType, error) {
        // Always hand a string to the notification: an Error without `reason`
        // used to be passed through as an object.
        const message = (error && (error.reason || error.message)) || String(error);
        displayNotif({
            type: 'error',
            title: 'Indexation : ',
            message,
            save: true,
        });
    },
};

export default indexationHooks;
import { Meteor } from 'meteor/meteor';
import { FlowRouter } from 'meteor/kadira:flow-router';
import displayNotif from './../../ui/components/notifs/notifs.js';
export default {
onSubmit: (insertDoc, updateDoc, currentDoc) => {
Meteor.loginWithPassword(insertDoc.login, insertDoc.password, (err) => {