Commit 43d77d05 authored by Nacim Goura

update crawl website

parent 6a4282a8
......@@ -42,7 +42,7 @@ export default class CrawlFacebook {
};
dataForIndex.urlText = checkData.slugText(checkData.cleanText(dataForIndex.url));
dataForIndex.urlText = checkData.cleanText(dataForIndex.url);
if (item.name) {
dataForIndex.title = checkData.cleanText(item.name);
......
......@@ -50,7 +50,7 @@ export default class CrawlTwitter {
};
dataForIndex.urlText = checkData.slugText(checkData.cleanText(dataForIndex.url));
dataForIndex.urlText = checkData.cleanText(dataForIndex.url);
if (item.text) {
item.text = item.text.replace(/^(?:https?:)?\/\//, '');
......
......@@ -48,6 +48,7 @@ export default class crawlWebsite extends CrawlGeneric {
const crawl = new Crawler({
skipDuplicates: true,
userAgent: 'Mozilla/5.0 (compatible; fr-crawler/1.1)',
callback: (error, res, done) => {
if (error || res.statusCode !== 200) {
this.listDataError.push({
......@@ -132,23 +133,23 @@ export default class crawlWebsite extends CrawlGeneric {
input: title,
},
description: $('meta[name=description]').attr('content'),
body: checkData.slugText(checkData.cleanText(body.text())),
html: checkData.cleanText(body.html()),
urlText: checkData.slugText(checkData.cleanText(decodeURI(currentUrl))),
body: checkData.cleanText(body.text()),
html: body.html(),
urlText: checkData.cleanText(decodeURI(currentUrl)),
url: decodeURI(currentUrl),
createdAt: new Date(),
};
if ($(this.config.crawl.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(this.config.crawl.breadcrumb).text()));
dataForIndex.breadcrumb = checkData.cleanText($(this.config.crawl.breadcrumb).text());
}
if ($('h1').text().length) {
dataForIndex.h1 = checkData.slugText(checkData.cleanText($('h1').text()));
dataForIndex.h1 = checkData.cleanText($('h1').text());
}
if ($('h2').text().length) {
dataForIndex.h2 = checkData.slugText(checkData.cleanText($('h2').text()));
dataForIndex.h2 = checkData.cleanText($('h2').text());
}
if (listPdf && listPdf.length) {
......
......@@ -11,6 +11,7 @@ export async function searchWebsite(data, userId) {
const search = new Search(userId);
try {
const results = await search.searchWebsite(data.searchTerm);
console.log(results);
return {
total: results.hits.total,
list: _.map(results.hits.hits, '_source'),
......
......@@ -33,8 +33,8 @@ export default class Search {
fields: [
'description.stemmed',
'body.stemmed',
'urlText.stemmed',
'title.stemmed',
'html',
],
},
},
......@@ -44,16 +44,19 @@ export default class Search {
query: term,
fuzziness: 'AUTO',
fields: [
'body.stemmed',
'description',
'description.stemmed',
'urlText',
'urlText.stemmed',
'body',
'title',
'url',
'breadcrumb',
'h1',
'h2',
'html',
'url',
'html',
],
},
}],
......
......@@ -14,7 +14,7 @@
<div class="panel-body">
{{#each result in websiteResults.list}}
<li>
<a href="{{result.url}}" target="_blank">{{result.title}}</a>
<a href="{{result.url}}" target="_blank">{{result.title}} ({{result.url}})</a>
</li>
{{/each}}
</div>
......
......@@ -10,3 +10,7 @@
.error, .stopped {
background-color: #E74C3C;
}
.iziToast-wrapper {
z-index: 1000;
}
......@@ -2,18 +2,8 @@
import SimpleSchema from 'simpl-schema';
import sanitizeHtml from 'sanitize-html';
import _ from 'lodash';
import slug from 'slug';
slug.defaults.modes.pretty = {
replacement: ' ',
symbols: true,
remove: null,
lower: true,
charmap: slug.charmap,
multicharmap: {
'&&': 'et', '||': 'ou',
},
};
import detergent from 'detergent';
import unfancy from 'string-unfancy';
const checkData = {
......@@ -23,21 +13,32 @@ const checkData = {
},
isSitemap(str) {
return !!(this.isUrl(str) && _.includes(str, 'sitemap'));
return !!(this.isUrl(str) && _.includes(['xml', 'txt'], _.last(_.split(str, '.'))) && _.includes(str, 'sitemap'));
},
cleanText(str) {
// enleve saut de ligne et slash
let cleanStr = _.replace(str, /[\n\\/]/gm, ' ');
// let cleanStr = _.replace(str, /[\n\\/]/gm, ' ');
// met un espace avant une majuscule
// cleanStr = _.replace(cleanStr, /([A-Z])/gm, ' $1');
// enleve les multiples espace
cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' ');
return _.trim(cleanStr);
},
slugText(str) {
return slug(str);
// cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' ');
let resultText = detergent(str, {
removeWidows: false, // replace the last space in paragraph with &nbsp;
convertEntities: false, // encode all non-ASCII chars
convertDashes: false, // typographically-correct the n/m-dashes
convertApostrophes: false, // typographically-correct the apostrophes
replaceLineBreaks: false, // replace all line breaks with BR's
removeLineBreaks: true, // put everything on one line
useXHTML: false, // add closing slashes on BR's
removeSoftHyphens: true, // remove character which encodes to &#173; or &shy;
dontEncodeNonLatin: false, // skip non-latin character encoding
keepBoldEtc: false, // any bold, strong, i or em tags are stripped of attributes and retained
addMissingSpaces: true, // adds missing spaces after dots/colons/semicolons, unless it's URL
});
resultText = resultText.replace(/[Æ]/g, 'ae'); // .replace(/[\n\\/]/gm, ' ').replace(/[<br>]/gm, ' ').replace(/[^\S]{2,}/gm, ' ');
resultText = resultText.replace(/([a-z])([A-Z])/g, '$1 $2');
return unfancy(resultText);
},
cleanHtml(html) {
......@@ -53,6 +54,9 @@ const checkData = {
allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {},
allowProtocolRelative: true,
transformTags: {
li: ' ',
},
});
},
......
import { chai } from 'meteor/practicalmeteor:chai';
import checkData from './checkData.js';
import unfancy from 'string-unfancy';
describe('CheckData Test', function () {
it('test if string is a url', function () {
describe('CheckData Test', () => {
it('test if string is a url', () => {
const url1 = 'https://guide.meteor.com/testing.html';
const url2 = 'https://www.yakaferci.com/genere-sitemap/';
const url3 = 'https://atmospherejs.com/?q=promise';
......@@ -18,8 +19,8 @@ describe('CheckData Test', function () {
});
});
describe('CheckData Test', function () {
it('test if url site a sitemap', function () {
describe('CheckData Test', () => {
it('test if url site a sitemap', () => {
const url1 = 'http://wp.seantburke.com/sitemap.xml';
const url2 = 'http://wp.seantburke.com/sitemap.txt';
const url3 = 'https://atmospherejs.com/?q=promise';
......@@ -31,3 +32,12 @@ describe('CheckData Test', function () {
chai.assert.equal(checkData.isSitemap(url4), false);
});
});
// Checks checkData.cleanText on a French sentence that contains hexadecimal
// HTML character references (&#x2019; and &#xE9;): the result must be a string
// in which those references appear as plain characters.
describe('CheckData Test', () => {
it('clean text', () => {
// Input containing hexadecimal character references for a typographic apostrophe and "é".
const text = 'directeur de l&#x2019;Agence Nationale de l&#x2019;Evaluation et de la qualit&#xE9; des &#xE9;tablissements et Services sociaux';
const result = checkData.cleanText(text);
chai.assert.isString(result);
// Expected output uses a plain ASCII apostrophe in place of &#x2019; and a literal "é" in place of &#xE9;.
chai.expect(result).to.equal('directeur de l\'Agence Nationale de l\'Evaluation et de la qualité des établissements et Services sociaux');
});
});
// Server entry point, imports all server code
import '../imports/startup/server/index.js';
import '/imports/startup/server/index.js';
#!/usr/bin/env bash
~/.nvm/nvm.sh use && cd ./.deploy/ && pm2-meteor deploy
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment