Commit 43d77d05 authored by Nacim Goura

update crawl website

parent 6a4282a8
......@@ -42,7 +42,7 @@ export default class CrawlFacebook {
};
dataForIndex.urlText = checkData.slugText(checkData.cleanText(dataForIndex.url));
dataForIndex.urlText = checkData.cleanText(dataForIndex.url);
if (item.name) {
dataForIndex.title = checkData.cleanText(item.name);
......
......@@ -50,7 +50,7 @@ export default class CrawlTwitter {
};
dataForIndex.urlText = checkData.slugText(checkData.cleanText(dataForIndex.url));
dataForIndex.urlText = checkData.cleanText(dataForIndex.url);
if (item.text) {
item.text = item.text.replace(/^(?:https?:)?\/\//, '');
......
......@@ -48,6 +48,7 @@ export default class crawlWebsite extends CrawlGeneric {
const crawl = new Crawler({
skipDuplicates: true,
userAgent: 'Mozilla/5.0 (compatible; fr-crawler/1.1)',
callback: (error, res, done) => {
if (error || res.statusCode !== 200) {
this.listDataError.push({
......@@ -132,23 +133,23 @@ export default class crawlWebsite extends CrawlGeneric {
input: title,
},
description: $('meta[name=description]').attr('content'),
body: checkData.slugText(checkData.cleanText(body.text())),
html: checkData.cleanText(body.html()),
urlText: checkData.slugText(checkData.cleanText(decodeURI(currentUrl))),
body: checkData.cleanText(body.text()),
html: body.html(),
urlText: checkData.cleanText(decodeURI(currentUrl)),
url: decodeURI(currentUrl),
createdAt: new Date(),
};
if ($(this.config.crawl.breadcrumb).text().length) {
dataForIndex.breadcrumb = checkData.slugText(checkData.cleanText($(this.config.crawl.breadcrumb).text()));
dataForIndex.breadcrumb = checkData.cleanText($(this.config.crawl.breadcrumb).text());
}
if ($('h1').text().length) {
dataForIndex.h1 = checkData.slugText(checkData.cleanText($('h1').text()));
dataForIndex.h1 = checkData.cleanText($('h1').text());
}
if ($('h2').text().length) {
dataForIndex.h2 = checkData.slugText(checkData.cleanText($('h2').text()));
dataForIndex.h2 = checkData.cleanText($('h2').text());
}
if (listPdf && listPdf.length) {
......
......@@ -11,6 +11,7 @@ export async function searchWebsite(data, userId) {
const search = new Search(userId);
try {
const results = await search.searchWebsite(data.searchTerm);
console.log(results);
return {
total: results.hits.total,
list: _.map(results.hits.hits, '_source'),
......
......@@ -33,8 +33,8 @@ export default class Search {
fields: [
'description.stemmed',
'body.stemmed',
'urlText.stemmed',
'title.stemmed',
'html',
],
},
},
......@@ -44,16 +44,19 @@ export default class Search {
query: term,
fuzziness: 'AUTO',
fields: [
'body.stemmed',
'description',
'description.stemmed',
'urlText',
'urlText.stemmed',
'body',
'title',
'url',
'breadcrumb',
'h1',
'h2',
'html',
'url',
'html',
],
},
}],
......
......@@ -14,7 +14,7 @@
<div class="panel-body">
{{#each result in websiteResults.list}}
<li>
<a href="{{result.url}}" target="_blank">{{result.title}}</a>
<a href="{{result.url}}" target="_blank">{{result.title}} ({{result.url}})</a>
</li>
{{/each}}
</div>
......
......@@ -10,3 +10,7 @@
/* Shared background colour (#E74C3C) for elements carrying the "error" or "stopped" class. */
.error, .stopped {
background-color: #E74C3C;
}
/* Sets the stacking order of the iziToast wrapper element to 1000.
   NOTE(review): presumably so toast notifications render above the page panels; confirm. */
.iziToast-wrapper {
z-index: 1000;
}
......@@ -2,18 +2,8 @@
import SimpleSchema from 'simpl-schema';
import sanitizeHtml from 'sanitize-html';
import _ from 'lodash';
import slug from 'slug';
slug.defaults.modes.pretty = {
replacement: ' ',
symbols: true,
remove: null,
lower: true,
charmap: slug.charmap,
multicharmap: {
'&&': 'et', '||': 'ou',
},
};
import detergent from 'detergent';
import unfancy from 'string-unfancy';
const checkData = {
......@@ -23,21 +13,32 @@ const checkData = {
},
isSitemap(str) {
return !!(this.isUrl(str) && _.includes(str, 'sitemap'));
return !!(this.isUrl(str) && _.includes(['xml', 'txt'], _.last(_.split(str, '.'))) && _.includes(str, 'sitemap'));
},
cleanText(str) {
// enleve saut de ligne et slash
let cleanStr = _.replace(str, /[\n\\/]/gm, ' ');
// let cleanStr = _.replace(str, /[\n\\/]/gm, ' ');
// met un espace avant une majuscule
// cleanStr = _.replace(cleanStr, /([A-Z])/gm, ' $1');
// enleve les multiples espace
cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' ');
return _.trim(cleanStr);
},
slugText(str) {
return slug(str);
// cleanStr = _.replace(cleanStr, /[^\S]{2,}/gm, ' ');
let resultText = detergent(str, {
removeWidows: false, // replace the last space in paragraph with &nbsp;
convertEntities: false, // encode all non-ASCII chars
convertDashes: false, // typographically-correct the n/m-dashes
convertApostrophes: false, // typographically-correct the apostrophes
replaceLineBreaks: false, // replace all line breaks with BR's
removeLineBreaks: true, // put everything on one line
useXHTML: false, // add closing slashes on BR's
removeSoftHyphens: true, // remove character which encodes to &#173; or &shy;
dontEncodeNonLatin: false, // skip non-latin character encoding
keepBoldEtc: false, // any bold, strong, i or em tags are stripped of attributes and retained
addMissingSpaces: true, // adds missing spaces after dots/colons/semicolons, unless it's URL
});
resultText = resultText.replace(/[Æ]/g, 'ae'); // .replace(/[\n\\/]/gm, ' ').replace(/[<br>]/gm, ' ').replace(/[^\S]{2,}/gm, ' ');
resultText = resultText.replace(/([a-z])([A-Z])/g, '$1 $2');
return unfancy(resultText);
},
cleanHtml(html) {
......@@ -53,6 +54,9 @@ const checkData = {
allowedSchemes: ['http', 'https', 'ftp', 'mailto'],
allowedSchemesByTag: {},
allowProtocolRelative: true,
transformTags: {
li: ' ',
},
});
},
......
import { chai } from 'meteor/practicalmeteor:chai';
import checkData from './checkData.js';
import unfancy from 'string-unfancy';
describe('CheckData Test', function () {
it('test if string is a url', function () {
describe('CheckData Test', () => {
it('test if string is a url', () => {
const url1 = 'https://guide.meteor.com/testing.html';
const url2 = 'https://www.yakaferci.com/genere-sitemap/';
const url3 = 'https://atmospherejs.com/?q=promise';
......@@ -18,8 +19,8 @@ describe('CheckData Test', function () {
});
});
describe('CheckData Test', function () {
it('test if url site a sitemap', function () {
describe('CheckData Test', () => {
it('test if url site a sitemap', () => {
const url1 = 'http://wp.seantburke.com/sitemap.xml';
const url2 = 'http://wp.seantburke.com/sitemap.txt';
const url3 = 'https://atmospherejs.com/?q=promise';
......@@ -31,3 +32,12 @@ describe('CheckData Test', function () {
chai.assert.equal(checkData.isSitemap(url4), false);
});
});
// Checks checkData.cleanText against a French sentence containing hexadecimal HTML
// entities (&#x2019; and &#xE9;): the result must be a string in which the entities
// are replaced by a plain apostrophe and the accented letter "é".
describe('CheckData Test', () => {
  it('clean text', () => {
    const encodedInput = 'directeur de l&#x2019;Agence Nationale de l&#x2019;Evaluation et de la qualit&#xE9; des &#xE9;tablissements et Services sociaux';
    const expectedOutput = 'directeur de l\'Agence Nationale de l\'Evaluation et de la qualité des établissements et Services sociaux';

    const cleaned = checkData.cleanText(encodedInput);

    chai.assert.isString(cleaned);
    chai.expect(cleaned).to.equal(expectedOutput);
  });
});
// Server entry point, imports all server code
import '../imports/startup/server/index.js';
import '/imports/startup/server/index.js';
#!/usr/bin/env bash
# Deploy helper: activates the Node version resolved by nvm, then runs
# pm2-meteor from the ./.deploy/ directory.
#
# nvm is implemented as a shell function, so nvm.sh has to be sourced into this
# shell before `nvm use` can be called. Executing nvm.sh as a separate process
# cannot change the Node version seen by the commands that follow.
. ~/.nvm/nvm.sh && nvm use && cd ./.deploy/ && pm2-meteor deploy
......@@ -107,6 +107,10 @@ array-uniq@^1.0.1:
version "1.0.3"
resolved "https://registry.yarnpkg.com/array-uniq/-/array-uniq-1.0.3.tgz#af6ac877a25cc7f74e058894753858dfdb24fdb6"
arrayiffy-if-string@*:
version "1.0.1"
resolved "https://registry.yarnpkg.com/arrayiffy-if-string/-/arrayiffy-if-string-1.0.1.tgz#8ef2800277af1314ac2751c026a52c8964050d8c"
arrify@^1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/arrify/-/arrify-1.0.1.tgz#898508da2226f380df904728456849c1501a4b0d"
......@@ -523,6 +527,19 @@ charset-parser@^0.2.0:
version "0.2.0"
resolved "https://registry.yarnpkg.com/charset-parser/-/charset-parser-0.2.0.tgz#230901088f5f6cb1659a8b5aaad2572db7a75b6b"
check-types-mini@^2.2.1:
version "2.2.1"
resolved "https://registry.yarnpkg.com/check-types-mini/-/check-types-mini-2.2.1.tgz#c2e20c2af323afd5680a3d8659884002119666b3"
dependencies:
arrayiffy-if-string "*"
lodash.clonedeep "*"
lodash.includes "*"
lodash.intersection "*"
lodash.isplainobject "*"
lodash.pullall "*"
object-assign "*"
type-detect "^4.0.0"
cheerio@^0.22.0:
version "0.22.0"
resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-0.22.0.tgz#a9baa860a3f9b595a6b81b1a86873121ed3a269e"
......@@ -725,6 +742,10 @@ css-what@2.1:
version "2.1.0"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-2.1.0.tgz#9467d032c38cfaefb9f2d79501253062f87fa1bd"
curl-quotes@^0.1.0:
version "0.1.0"
resolved "https://registry.yarnpkg.com/curl-quotes/-/curl-quotes-0.1.0.tgz#b0f26f44df24f12d9258a3e02c055959a61707c5"
d@1:
version "1.0.0"
resolved "https://registry.yarnpkg.com/d/-/d-1.0.0.tgz#754bb5bfe55451da69a58b94d45f4c5b0462d58f"
......@@ -832,6 +853,23 @@ detect-indent@^4.0.0:
dependencies:
repeating "^2.0.0"
detergent@^2.28.3:
version "2.28.3"
resolved "https://registry.yarnpkg.com/detergent/-/detergent-2.28.3.tgz#e85bbcae064e57d1e998a49a3e0888275969cade"
dependencies:
curl-quotes "^0.1.0"
easy-replace "^2.10.0"
he "^1.0.0"
lodash.clonedeep "^4.5.0"
lodash.isplainobject "^4.0.6"
lodash.toarray "^4.4.0"
lower-case "^1.1.4"
object-assign "^4.1.1"
string "^3.3.1"
typographic-en-dashes "^1.0.1"
unicode-dragon "^0.1.3"
upper-case "^1.1.3"
diffie-hellman@^5.0.0:
version "5.0.2"
resolved "https://registry.yarnpkg.com/diffie-hellman/-/diffie-hellman-5.0.2.tgz#b5835739270cfe26acf632099fded2a07f209e5e"
......@@ -890,6 +928,14 @@ double-ended-queue@^2.1.0-0:
version "2.1.0-0"
resolved "https://registry.yarnpkg.com/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz#103d3527fd31528f40188130c841efdd78264e5c"
easy-replace@^2.10.0:
version "2.10.1"
resolved "https://registry.yarnpkg.com/easy-replace/-/easy-replace-2.10.1.tgz#77af98ef9adaed76dfaf3d4d899bd63ab86c0f9b"
dependencies:
check-types-mini "^2.2.1"
lodash.toarray "*"
lodash.without "*"
ecc-jsbn@~0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/ecc-jsbn/-/ecc-jsbn-0.1.1.tgz#0fc73a9ed5f0d53c38193398523ef7e543777505"
......@@ -1459,6 +1505,10 @@ hawk@~3.1.3:
hoek "2.x.x"
sntp "1.x.x"
he@^1.0.0, he@^1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/he/-/he-1.1.1.tgz#93410fd21b009735151f8868c2f271f3427e23fd"
hmac-drbg@^1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/hmac-drbg/-/hmac-drbg-1.0.1.tgz#d2745701025a6c775a6c545793ed502fc0c649a1"
......@@ -1691,7 +1741,7 @@ isstream@~0.1.2:
version "0.1.2"
resolved "https://registry.yarnpkg.com/isstream/-/isstream-0.1.2.tgz#47e63f7af55afa6f92e1500e690eb8b8529c099a"
izitoast@^1.1.4:
izitoast@^1.1.1:
version "1.1.4"
resolved "https://registry.yarnpkg.com/izitoast/-/izitoast-1.1.4.tgz#a9f0ab7d9532dde9041a9657df15de6344b9e9d5"
......@@ -1804,6 +1854,10 @@ lodash.bind@^4.1.4:
version "4.2.1"
resolved "https://registry.yarnpkg.com/lodash.bind/-/lodash.bind-4.2.1.tgz#7ae3017e939622ac31b7d7d7dcb1b34db1690d35"
lodash.clonedeep@*, lodash.clonedeep@^4.5.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz#e23f3f9c4f8fbdde872529c1071857a086e5ccef"
lodash.cond@^4.3.0:
version "4.5.2"
resolved "https://registry.yarnpkg.com/lodash.cond/-/lodash.cond-4.5.2.tgz#f471a1da486be60f6ab955d17115523dd1d255d5"
......@@ -1832,10 +1886,22 @@ lodash.get@^4.4.2:
version "4.4.2"
resolved "https://registry.yarnpkg.com/lodash.get/-/lodash.get-4.4.2.tgz#2d177f652fa31e939b4438d5341499dfa3825e99"
lodash.includes@*:
version "4.3.0"
resolved "https://registry.yarnpkg.com/lodash.includes/-/lodash.includes-4.3.0.tgz#60bb98a87cb923c68ca1e51325483314849f553f"
lodash.intersection@*:
version "4.4.0"
resolved "https://registry.yarnpkg.com/lodash.intersection/-/lodash.intersection-4.4.0.tgz#0a11ba631d0e95c23c7f2f4cbb9a692ed178e705"
lodash.isempty@^4.4.0:
version "4.4.0"
resolved "https://registry.yarnpkg.com/lodash.isempty/-/lodash.isempty-4.4.0.tgz#6f86cbedd8be4ec987be9aaf33c9684db1b31e7e"
lodash.isplainobject@*, lodash.isplainobject@^4.0.6:
version "4.0.6"
resolved "https://registry.yarnpkg.com/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz#7c526a52d89b45c45cc690b88163be0497f550cb"
lodash.map@^4.4.0:
version "4.6.0"
resolved "https://registry.yarnpkg.com/lodash.map/-/lodash.map-4.6.0.tgz#771ec7839e3473d9c4cde28b19394c3562f4f6d3"
......@@ -1852,6 +1918,10 @@ lodash.pick@^4.2.1:
version "4.4.0"
resolved "https://registry.yarnpkg.com/lodash.pick/-/lodash.pick-4.4.0.tgz#52f05610fff9ded422611441ed1fc123a03001b3"
lodash.pullall@*:
version "4.2.0"
resolved "https://registry.yarnpkg.com/lodash.pullall/-/lodash.pullall-4.2.0.tgz#9d98b8518b7c965b0fae4099bd9fb7df8bbf38ba"
lodash.reduce@^4.4.0:
version "4.6.0"
resolved "https://registry.yarnpkg.com/lodash.reduce/-/lodash.reduce-4.6.0.tgz#f1ab6b839299ad48f784abbf476596f03b914d3b"
......@@ -1864,10 +1934,18 @@ lodash.some@^4.4.0:
version "4.6.0"
resolved "https://registry.yarnpkg.com/lodash.some/-/lodash.some-4.6.0.tgz#1bb9f314ef6b8baded13b549169b2a945eb68e4d"
lodash.toarray@*, lodash.toarray@^4.4.0:
version "4.4.0"
resolved "https://registry.yarnpkg.com/lodash.toarray/-/lodash.toarray-4.4.0.tgz#24c4bfcd6b2fba38bfd0594db1179d8e9b656561"
lodash.trimend@^4.5.1:
version "4.5.1"
resolved "https://registry.yarnpkg.com/lodash.trimend/-/lodash.trimend-4.5.1.tgz#12804437286b98cad8996b79414e11300114082f"
lodash.without@*:
version "4.4.0"
resolved "https://registry.yarnpkg.com/lodash.without/-/lodash.without-4.4.0.tgz#3cd4574a00b67bae373a94b748772640507b7aac"
lodash@2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/lodash/-/lodash-2.4.2.tgz#fadd834b9683073da179b3eae6d9c0d15053f73e"
......@@ -1890,6 +1968,10 @@ loose-envify@^1.0.0:
dependencies:
js-tokens "^3.0.0"
lower-case@^1.1.4:
version "1.1.4"
resolved "https://registry.yarnpkg.com/lower-case/-/lower-case-1.1.4.tgz#9a2cabd1b9e8e0ae993a4bf7d5875c39c42e8eac"
media-typer@0.3.0:
version "0.3.0"
resolved "https://registry.yarnpkg.com/media-typer/-/media-typer-0.3.0.tgz#8710d7af0aa626f8fffa1ce00168545263255748"
......@@ -2074,7 +2156,7 @@ oauth-sign@~0.8.1:
version "0.8.2"
resolved "https://registry.yarnpkg.com/oauth-sign/-/oauth-sign-0.8.2.tgz#46a6ab7f0aead8deae9ec0565780b7d4efeb9d43"
object-assign@^4.0.1, object-assign@^4.1.0:
object-assign@*, object-assign@^4.0.1, object-assign@^4.1.0, object-assign@^4.1.1:
version "4.1.1"
resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863"
......@@ -2566,12 +2648,6 @@ slice-ansi@0.0.4:
version "0.0.4"
resolved "https://registry.yarnpkg.com/slice-ansi/-/slice-ansi-0.0.4.tgz#edbf8903f66f7ce2f8eafd6ceed65e264c831b35"
slug@^0.9.1:
version "0.9.1"
resolved "https://registry.yarnpkg.com/slug/-/slug-0.9.1.tgz#af08f608a7c11516b61778aa800dce84c518cfda"
dependencies:
unicode ">= 0.3.1"
sntp@1.x.x:
version "1.0.9"
resolved "https://registry.yarnpkg.com/sntp/-/sntp-1.0.9.tgz#6541184cc90aeea6c6e7b35e2659082443c66198"
......@@ -2642,6 +2718,12 @@ stream-browserify@^2.0.1:
inherits "~2.0.1"
readable-stream "^2.0.2"
string-unfancy@^1.0.9:
version "1.0.9"
resolved "https://registry.yarnpkg.com/string-unfancy/-/string-unfancy-1.0.9.tgz#f4dbe24db6409df7deb0ff95b643f26ebd3e26ae"
dependencies:
he "^1.1.1"
string-width@^1.0.1, string-width@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-1.0.2.tgz#118bdf5b8cdc51a2a7e70d211e07e2b0b9b107d3"
......@@ -2657,6 +2739,10 @@ string-width@^2.0.0:
is-fullwidth-code-point "^2.0.0"
strip-ansi "^3.0.0"
string@^3.3.1:
version "3.3.3"
resolved "https://registry.yarnpkg.com/string/-/string-3.3.3.tgz#5ea211cd92d228e184294990a6cc97b366a77cb0"
string_decoder@^1.0.1, string_decoder@~1.0.0:
version "1.0.2"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.0.2.tgz#b29e1f4e1125fa97a10382b8a533737b7491e179"
......@@ -2782,6 +2868,10 @@ type-check@~0.3.2:
dependencies:
prelude-ls "~1.1.2"
type-detect@^4.0.0:
version "4.0.3"
resolved "https://registry.yarnpkg.com/type-detect/-/type-detect-4.0.3.tgz#0e3f2670b44099b0b46c284d136a7ef49c74c2ea"
type-is@^1.6.14:
version "1.6.15"
resolved "https://registry.yarnpkg.com/type-is/-/type-is-1.6.15.tgz#cab10fb4909e441c82842eafe1ad646c81804410"
......@@ -2793,6 +2883,10 @@ typedarray@^0.0.6:
version "0.0.6"
resolved "https://registry.yarnpkg.com/typedarray/-/typedarray-0.0.6.tgz#867ac74e3864187b1d3d47d996a78ec5c8830777"
typographic-en-dashes@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/typographic-en-dashes/-/typographic-en-dashes-1.0.1.tgz#a37739fdc43ed38c351bbf70a80c2503ccb022e0"
uglify-js@^2.6:
version "2.8.28"
resolved "https://registry.yarnpkg.com/uglify-js/-/uglify-js-2.8.28.tgz#e335032df9bb20dcb918f164589d5af47f38834a"
......@@ -2821,14 +2915,18 @@ underscore@1.8.3:
version "1.8.3"
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.8.3.tgz#4f3fb53b106e6097fcf9cb4109f2a5e9bdfa5022"
"unicode@>= 0.3.1":
version "9.0.1"
resolved "https://registry.yarnpkg.com/unicode/-/unicode-9.0.1.tgz#104706272c6464c574801be1b086f7245cf25158"
unicode-dragon@^0.1.3:
version "0.1.3"
resolved "https://registry.yarnpkg.com/unicode-dragon/-/unicode-dragon-0.1.3.tgz#ef7d25028690cc22170de8f02f93814494543455"
unpipe@~1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/unpipe/-/unpipe-1.0.0.tgz#b2bf4ee8514aae6165b4817829d21b2ef49904ec"
upper-case@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/upper-case/-/upper-case-1.1.3.tgz#f6b4501c2ec4cdd26ba78be7222961de77621598"
url@^0.11.0:
version "0.11.0"
resolved "https://registry.yarnpkg.com/url/-/url-0.11.0.tgz#3838e97cfc60521eb73c525a8e55bfdd9e2e28f1"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment