Commit 43d77d05 authored by Nacim Goura

update crawl website

parent 6a4282a8
...@@ -42,7 +42,7 @@ export default class CrawlFacebook { ...@@ -42,7 +42,7 @@ export default class CrawlFacebook {
}; };
dataForIndex.urlText = checkData.slugText(checkData.cleanText(dataForIndex.url)); dataForIndex.urlText = checkData.cleanText(dataForIndex.url);
if (item.name) { if (item.name) {
dataForIndex.title = checkData.cleanText(item.name); dataForIndex.title = checkData.cleanText(item.name);
......
...@@ -50,7 +50,7 @@ export default class CrawlTwitter { ...@@ -50,7 +50,7 @@ export default class CrawlTwitter {
}; };
dataForIndex.urlText = checkData.slugText(checkData.cleanText(dataForIndex.url)); dataForIndex.urlText = checkData.cleanText(dataForIndex.url);
if (item.text) { if (item.text) {
item.text = item.text.replace(/^(?:https?:)?\/\//, ''); item.text = item.text.replace(/^(?:https?:)?\/\//, '');
......
...@@ -48,6 +48,7 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -48,6 +48,7 @@ export default class crawlWebsite extends CrawlGeneric {
const crawl = new Crawler({ const crawl = new Crawler({
skipDuplicates: true, skipDuplicates: true,
userAgent: 'Mozilla/5.0 (compatible; fr-crawler/1.1)',
callback: (error, res, done) => { callback: (error, res, done) => {
if (error || res.statusCode !== 200) { if (error || res.statusCode !== 200) {
this.listDataError.push({ this.listDataError.push({
...@@ -132,23 +133,23 @@ export default class crawlWebsite extends CrawlGeneric { ...@@ -132,23 +133,23 @@ export default class crawlWebsite extends CrawlGeneric {
input: title, input: title,
}, },
description: $('meta[name=description]').attr('content'), description: $('meta[name=description]').attr('content'),
body: checkData.slugText(checkData.cleanText(body.text())), body: checkData.cleanText(body.text()),
html: checkData.cleanText(body.html()), html: body.html(),
urlText: checkData.slugText(checkData.cleanText(decodeURI(currentUrl))), urlText: checkData.cleanText(decodeURI(currentUrl)),
url: decodeURI(currentUrl), url: decodeURI(currentUrl),
createdAt: new Date(), createdAt: new Date(),
}; };
if ($(this.config.crawl.breadcrumb).text().length) { if ($(this.config.crawl.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(this.config.crawl.breadcrumb).text())); dataForIndex.breadcrumb = checkData.cleanText($(this.config.crawl.breadcrumb).text());
} }
if ($('h1').text().length) { if ($('h1').text().length) {
dataForIndex.h1 = checkData.slugText(checkData.cleanText($('h1').text())); dataForIndex.h1 = checkData.cleanText($('h1').text());
} }
if ($('h2').text().length) { if ($('h2').text().length) {
dataForIndex.h2 = checkData.slugText(checkData.cleanText($('h2').text())); dataForIndex.h2 = checkData.cleanText($('h2').text());
} }
if (listPdf && listPdf.length) { if (listPdf && listPdf.length) {
......
...@@ -11,6 +11,7 @@ export async function searchWebsite(data, userId) { ...@@ -11,6 +11,7 @@ export async function searchWebsite(data, userId) {
const search = new Search(userId); const search = new Search(userId);
try { try {
const results = await search.searchWebsite(data.searchTerm); const results = await search.searchWebsite(data.searchTerm);
console.log(results);
return { return {
total: results.hits.total, total: results.hits.total,
list: _.map(results.hits.hits, '_source'), list: _.map(results.hits.hits, '_source'),
......
...@@ -33,8 +33,8 @@ export default class Search { ...@@ -33,8 +33,8 @@ export default class Search {
fields: [ fields: [
'description.stemmed', 'description.stemmed',
'body.stemmed', 'body.stemmed',
'urlText.stemmed',
'title.stemmed', 'title.stemmed',
'html',
], ],
}, },
}, },
...@@ -44,16 +44,19 @@ export default class Search { ...@@ -44,16 +44,19 @@ export default class Search {
query: term, query: term,
fuzziness: 'AUTO', fuzziness: 'AUTO',
fields: [ fields: [
'body.stemmed',
'description', 'description',
'description.stemmed',
'urlText', 'urlText',
'urlText.stemmed',
'body', 'body',
'title', 'title',
'url', 'url',
'breadcrumb', 'breadcrumb',
'h1', 'h1',
'h2', 'h2',
'html',
'url', 'url',
'html',
], ],
}, },
}], }],
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
<div class="panel-body"> <div class="panel-body">
{{#each result in websiteResults.list}} {{#each result in websiteResults.list}}
<li> <li>
<a href="{{result.url}}" target="_blank">{{result.title}}</a> <a href="{{result.url}}" target="_blank">{{result.title}} ({{result.url}})</a>
</li> </li>
{{/each}} {{/each}}
</div> </div>
......
...@@ -10,3 +10,7 @@ ...@@ -10,3 +10,7 @@
.error, .stopped { .error, .stopped {
background-color: #E74C3C; background-color: #E74C3C;
} }
/* Sets the stacking order of the iziToast wrapper element.
   NOTE(review): presumably raised so toast notifications render above other
   page layers — confirm against the z-index values used elsewhere. */
.iziToast-wrapper {
z-index: 1000;
}
...@@ -2,18 +2,8 @@ ...@@ -2,18 +2,8 @@
import SimpleSchema from 'simpl-schema'; import SimpleSchema from 'simpl-schema';
import sanitizeHtml from 'sanitize-html'; import sanitizeHtml from 'sanitize-html';
import _ from 'lodash'; import _ from 'lodash';
import slug from 'slug'; import detergent from 'detergent';
import unfancy from 'string-unfancy';
slug.defaults.modes.pretty = {
replacement: ' ',
symbols: true,
remove: null,
lower: true,
charmap: slug.charmap,
multicharmap: {
'&&': 'et', '||': 'ou',
},
};
const checkData = { const checkData = {
...@@ -23,21 +13,32 @@ const checkData = { ...@@ -23,21 +13,32 @@ const checkData = {
}, },
isSitemap(str) { isSitemap(str) {
return !!(this.isUrl(str) && _.includes(str, 'sitemap')); return !!(this.isUrl(str) && _.includes(['xml', 'txt'], _.last(_.split(str, '.'))) && _.includes(str, 'sitemap'));
}, },
cleanText(str) { cleanText(str) {
// enleve saut de ligne et slash // enleve saut de ligne et slash
let cleanStr = _.replace(str, /[\n\\/]/gm, ' '); // let cleanStr = _.replace(str, /[\n\\/]/gm, ' ');
// met un espace avant une majuscule // met un espace avant une majuscule
// cleanStr = _.replace(cleanStr, /([A-Z])/gm, ' $1'); // cleanStr = _.replace(cleanStr, /([A-Z])/gm, ' $1');
// enleve les multiples espace // enleve les multiples espace
cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' '); // cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' ');
return _.trim(cleanStr); let resultText = detergent(str, {
}, removeWidows: false, // replace the last space in paragraph with &nbsp;
convertEntities: false, // encode all non-ASCII chars
slugText(str) { convertDashes: false, // typographically-correct the n/m-dashes
return slug(str); convertApostrophes: false, // typographically-correct the apostrophes
replaceLineBreaks: false, // replace all line breaks with BR's
removeLineBreaks: true, // put everything on one line
useXHTML: false, // add closing slashes on BR's
removeSoftHyphens: true, // remove character which encodes to &#173; or &shy;
dontEncodeNonLatin: false, // skip non-latin character encoding
keepBoldEtc: false, // any bold, strong, i or em tags are stripped of attributes and retained
addMissingSpaces: true, // adds missing spaces after dots/colons/semicolons, unless it's URL
});
resultText = resultText.replace(/[Æ]/g, 'ae'); // .replace(/[\n\\/]/gm, ' ').replace(/[<br>]/gm, ' ').replace(/[^\S]{2,}/gm, ' ');
resultText = resultText.replace(/([a-z])([A-Z])/g, '$1 $2');
return unfancy(resultText);
}, },
cleanHtml(html) { cleanHtml(html) {
...@@ -53,6 +54,9 @@ const checkData = { ...@@ -53,6 +54,9 @@ const checkData = {
allowedSchemes: ['http', 'https', 'ftp', 'mailto'], allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {}, allowedSchemesByTag: {},
allowProtocolRelative: true, allowProtocolRelative: true,
transformTags: {
li: ' ',
},
}); });
}, },
......
import { chai } from 'meteor/practicalmeteor:chai'; import { chai } from 'meteor/practicalmeteor:chai';
import checkData from './checkData.js'; import checkData from './checkData.js';
import unfancy from 'string-unfancy';
describe('CheckData Test', function () { describe('CheckData Test', () => {
it('test if string is a url', function () { it('test if string is a url', () => {
const url1 = 'https://guide.meteor.com/testing.html'; const url1 = 'https://guide.meteor.com/testing.html';
const url2 = 'https://www.yakaferci.com/genere-sitemap/'; const url2 = 'https://www.yakaferci.com/genere-sitemap/';
const url3 = 'https://atmospherejs.com/?q=promise'; const url3 = 'https://atmospherejs.com/?q=promise';
...@@ -18,8 +19,8 @@ describe('CheckData Test', function () { ...@@ -18,8 +19,8 @@ describe('CheckData Test', function () {
}); });
}); });
describe('CheckData Test', function () { describe('CheckData Test', () => {
it('test if url site a sitemap', function () { it('test if url site a sitemap', () => {
const url1 = 'http://wp.seantburke.com/sitemap.xml'; const url1 = 'http://wp.seantburke.com/sitemap.xml';
const url2 = 'http://wp.seantburke.com/sitemap.txt'; const url2 = 'http://wp.seantburke.com/sitemap.txt';
const url3 = 'https://atmospherejs.com/?q=promise'; const url3 = 'https://atmospherejs.com/?q=promise';
...@@ -31,3 +32,12 @@ describe('CheckData Test', function () { ...@@ -31,3 +32,12 @@ describe('CheckData Test', function () {
chai.assert.equal(checkData.isSitemap(url4), false); chai.assert.equal(checkData.isSitemap(url4), false);
}); });
}); });
// Checks that cleanText turns entity-encoded French text into plain text with
// a straight apostrophe and decoded accented characters.
describe('CheckData Test', () => {
  it('clean text', () => {
    const encodedInput = 'directeur de l&#x2019;Agence Nationale de l&#x2019;Evaluation et de la qualit&#xE9; des &#xE9;tablissements et Services sociaux';
    const expectedOutput = 'directeur de l\'Agence Nationale de l\'Evaluation et de la qualité des établissements et Services sociaux';

    const cleaned = checkData.cleanText(encodedInput);

    // The result must be a string and match the expected text exactly.
    chai.assert.isString(cleaned);
    chai.expect(cleaned).to.equal(expectedOutput);
  });
});
// Server entry point, imports all server code // Server entry point, imports all server code
import '../imports/startup/server/index.js'; import '/imports/startup/server/index.js';
#!/usr/bin/env bash
# Deploys the app with pm2-meteor from the ./.deploy/ directory, using the
# Node version selected by nvm.
#
# nvm is a shell function defined by nvm.sh, so the file has to be sourced
# into this shell; executing it as a program leaves `nvm` undefined and never
# switches the Node version. Each step runs only if the previous one succeeded.
. "$HOME/.nvm/nvm.sh" && nvm use && cd ./.deploy/ && pm2-meteor deploy
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment