Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
IDsearch
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
IDsearch
IDsearch
Commits
fef1fde1
Commit
fef1fde1
authored
Jul 07, 2017
by
Nacim Goura
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
finish demo
parent
fc0518f2
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
101 additions
and
238 deletions
+101
-238
configCollection.js
imports/api/config/configCollection.js
+2
-0
crawlApi.js
imports/api/crawl/api/server/crawlApi.js
+9
-7
crawlFacebook.js
imports/api/crawl/api/server/crawlFacebook.js
+4
-4
crawlTwitter.js
imports/api/crawl/api/server/crawlTwitter.js
+6
-5
crawlGeneric.js
imports/api/crawl/crawlGeneric.js
+9
-0
crawlNetwork.js
imports/api/crawl/network/server/crawlNetwork.js
+4
-2
formWebsiteCrawlSchema.js
imports/api/crawl/website/formWebsiteCrawlSchema.js
+1
-1
crawlWebsite.js
imports/api/crawl/website/server/crawlWebsite.js
+14
-15
methods.js
imports/api/indexation/methods.js
+43
-30
elasticSearchConfig.js
imports/api/indexation/server/elasticSearchConfig.js
+0
-168
indexGeneric.js
imports/api/indexation/server/indexGeneric.js
+6
-2
website.html
imports/ui/pages/admin/indexation/website/website.html
+1
-1
website.js
imports/ui/pages/admin/indexation/website/website.js
+1
-1
settings.json
settings.json
+1
-2
No files found.
imports/api/config/configCollection.js
View file @
fef1fde1
...
...
@@ -38,10 +38,12 @@ SimpleSchema.configCollection = new SimpleSchema({
'
listConfig.$.breadcrumb
'
:
{
type
:
String
,
label
:
'
Element du breadcrumb
'
,
required
:
false
,
},
'
listConfig.$.forbiddenWordString
'
:
{
type
:
String
,
label
:
'
Mot non indexable (à séparer par une virgule)
'
,
required
:
false
,
autoform
:
{
class
:
'
forbiddenWordWebsite
'
,
},
...
...
imports/api/crawl/api/server/crawlApi.js
View file @
fef1fde1
import
CrawlFacebook
from
'
./crawlFacebook
'
;
import
CrawlTwitter
from
'
./crawlTwitter
'
;
import
CrawlGeneric
from
'
/imports/api/crawl/crawlGeneric
'
;
import
CrawlFacebook
from
'
/imports/api/crawl/api/server/crawlFacebook
'
;
import
CrawlTwitter
from
'
/imports/api/crawl/api/server/crawlTwitter
'
;
export
default
class
crawlApi
{
export
default
class
crawlApi
extends
CrawlGeneric
{
constructor
(
config
)
{
switch
(
config
.
type
)
{
constructor
(
data
)
{
super
();
switch
(
data
.
type
)
{
case
'
facebook
'
:
return
new
CrawlFacebook
(
config
);
return
new
CrawlFacebook
(
this
.
config
,
data
);
case
'
twitter
'
:
return
new
CrawlTwitter
(
config
);
return
new
CrawlTwitter
(
this
.
config
,
data
);
default
:
throw
new
Meteor
.
Error
(
'
Error
'
,
'
Aucun type pour l
\'
API indiqué
'
);
}
...
...
imports/api/crawl/api/server/crawlFacebook.js
View file @
fef1fde1
...
...
@@ -5,13 +5,13 @@ import checkData from '/imports/utils/checkData';
export
default
class
CrawlFacebook
{
constructor
(
config
)
{
constructor
(
data
,
config
)
{
this
.
config
=
config
;
console
.
log
(
'
crawl Facebook
'
);
this
.
listDataForIndex
=
[];
if
(
config
.
content
)
{
this
.
content
=
JSON
.
parse
(
config
.
content
);
if
(
data
.
content
)
{
this
.
content
=
JSON
.
parse
(
data
.
content
);
return
this
.
start
();
}
throw
new
Meteor
.
Error
(
'
Error
'
,
'
aucune donnée Facebook!
'
);
...
...
@@ -71,7 +71,7 @@ export default class CrawlFacebook {
this
.
listDataForIndex
.
push
({
index
:
{
_index
:
Meteor
.
settings
.
private
.
elasticsearch
.
esIndex
,
_type
:
Meteor
.
settings
.
private
.
elasticsearch
.
esType
,
_type
:
this
.
config
.
userId
,
_id
:
dataForIndex
.
url
,
},
});
...
...
imports/api/crawl/api/server/crawlTwitter.js
View file @
fef1fde1
...
...
@@ -6,9 +6,10 @@ import checkData from '/imports/utils/checkData';
export
default
class
CrawlTwitter
{
constructor
(
config
)
{
constructor
(
config
,
data
)
{
console
.
log
(
'
crawl Twittter
'
);
this
.
config
=
config
;
this
.
data
=
data
;
this
.
listDataForIndex
=
[];
this
.
client
=
new
Twitter
({
...
...
@@ -27,7 +28,7 @@ export default class CrawlTwitter {
*/
async
start
()
{
try
{
const
tweets
=
await
this
.
client
.
get
(
'
statuses/user_timeline
'
,
{
screen_name
:
this
.
config
.
idPage
});
const
tweets
=
await
this
.
client
.
get
(
'
statuses/user_timeline
'
,
{
screen_name
:
this
.
data
.
idPage
});
return
this
.
parseData
(
tweets
);
}
catch
(
e
)
{
throw
new
Meteor
.
Error
(
'
Error
'
,
'
Erreur lors l
\'
utilisation de l
\'
API Twitter
'
,
'
statuses/user_timeline
'
);
...
...
@@ -44,8 +45,8 @@ export default class CrawlTwitter {
const
dataForIndex
=
{
tag
:
'
api
'
,
apiName
:
'
twitter
'
,
domain
:
this
.
config
.
idPage
,
url
:
`https://twitter.com/
${
this
.
config
.
idPage
}
/status/
${
item
.
id_str
}
`
,
domain
:
this
.
data
.
idPage
,
url
:
`https://twitter.com/
${
this
.
data
.
idPage
}
/status/
${
item
.
id_str
}
`
,
};
...
...
@@ -65,7 +66,7 @@ export default class CrawlTwitter {
this
.
listDataForIndex
.
push
({
index
:
{
_index
:
Meteor
.
settings
.
private
.
elasticsearch
.
esIndex
,
_type
:
Meteor
.
settings
.
private
.
elasticsearch
.
esType
,
_type
:
this
.
config
.
userId
,
_id
:
dataForIndex
.
url
,
},
});
...
...
imports/api/crawl/crawlGeneric.js
0 → 100644
View file @
fef1fde1
import
{
getConfig
}
from
'
/imports/api/config/methods
'
;
export
default
class
crawlGeneric
{
constructor
()
{
this
.
config
=
getConfig
();
}
}
imports/api/crawl/network/server/crawlNetwork.js
View file @
fef1fde1
import
{
Meteor
}
from
'
meteor/meteor
'
;
import
CrawlGeneric
from
'
/imports/api/crawl/crawlGeneric
'
;
import
fs
from
'
fs
'
;
import
_
from
'
lodash
'
;
import
Files
from
'
/imports/api/crawl/network/networkCollection
'
;
export
default
class
CrawlNetwork
{
export
default
class
CrawlNetwork
extends
CrawlGeneric
{
constructor
()
{
super
();
this
.
files
=
Files
.
find
({
userId
:
Meteor
.
userId
()
}).
fetch
();
this
.
listDataForIndex
=
[];
console
.
log
(
'
init crawl network!
'
);
...
...
@@ -23,7 +25,7 @@ export default class CrawlNetwork {
this
.
listDataForIndex
.
push
({
index
:
{
_index
:
Meteor
.
settings
.
private
.
elasticsearch
.
esIndex
,
_type
:
Meteor
.
settings
.
private
.
elasticsearch
.
esType
,
_type
:
this
.
config
.
userId
,
_id
:
file
.
name
,
},
});
...
...
imports/api/crawl/website/formWebsiteCrawlSchema.js
View file @
fef1fde1
...
...
@@ -8,7 +8,7 @@ export default new SimpleSchema({
type
:
String
,
label
:
'
Url du site ou du sitemap à indexer :
'
,
},
c
onfig
:
{
nameC
onfig
:
{
type
:
String
,
label
:
'
Configuration à appliquer :
'
,
},
...
...
imports/api/crawl/website/server/crawlWebsite.js
View file @
fef1fde1
import
url
from
'
url
'
;
import
_
from
'
lodash
'
;
import
CrawlGeneric
from
'
/imports/api/crawl/crawlGeneric
'
;
import
{
Meteor
}
from
'
meteor/meteor
'
;
import
Crawler
from
'
crawler
'
;
import
Sitemapper
from
'
sitemapper
'
;
import
checkData
from
'
/imports/utils/checkData
'
;
export
default
class
crawlWebsite
{
export
default
class
crawlWebsite
extends
CrawlGeneric
{
/**
* crawl list urls
...
...
@@ -13,13 +15,9 @@ export default class crawlWebsite {
* @returns {Promise}
*/
constructor
(
data
)
{
super
();
this
.
urlWebsite
=
data
.
urlWebsite
;
this
.
config
=
{
domain
:
url
.
parse
(
this
.
urlWebsite
).
hostname
,
forbiddenWord
:
[],
};
this
.
config
=
JSON
.
parse
(
data
.
config
);
this
.
config
.
crawl
=
_
.
find
(
this
.
config
.
listConfig
,
n
=>
n
.
domain
===
data
.
nameConfig
);
return
this
.
start
();
}
...
...
@@ -58,15 +56,17 @@ export default class crawlWebsite {
res
.
$
(
'
a
'
).
each
((
index
,
a
)
=>
{
const
urlHref
=
res
.
$
(
a
).
attr
(
'
href
'
);
if
(
urlHref
)
{
cons
t
toQueueUrl
=
url
.
resolve
(
res
.
request
.
uri
.
href
,
res
.
$
(
a
).
attr
(
'
href
'
));
le
t
toQueueUrl
=
url
.
resolve
(
res
.
request
.
uri
.
href
,
res
.
$
(
a
).
attr
(
'
href
'
));
// check if same domain name
if
(
toQueueUrl
.
includes
(
this
.
config
.
domain
))
{
toQueueUrl
=
toQueueUrl
.
replace
(
'
https
'
,
'
http
'
);
// console.log(toQueueUrl, toQueueUrl.includes(this.config.crawl.domain));
if
(
toQueueUrl
.
includes
(
this
.
config
.
crawl
.
domain
))
{
// clean url
// toQueueUrl = checkData.cleanUrl(toQueueUrl);
// check if already visited
if
(
!
this
.
listUrlAlreadyVisited
.
includes
(
toQueueUrl
))
{
// check if url has forbidden word
if
(
!
new
RegExp
(
this
.
config
.
forbiddenWord
.
join
(
'
|
'
)).
test
(
toQueueUrl
))
{
if
(
!
new
RegExp
(
this
.
config
.
crawl
.
forbiddenWord
.
join
(
'
|
'
)).
test
(
toQueueUrl
))
{
// check if url is good for crawl
if
(
checkData
.
checkCrawlUrl
(
toQueueUrl
))
{
// add url in already visited urls
...
...
@@ -109,13 +109,12 @@ export default class crawlWebsite {
* @param listPdf
*/
parseData
(
$
,
currentUrl
,
listPdf
)
{
const
_this
=
this
;
const
body
=
$
(
'
body
'
);
body
.
html
(
checkData
.
cleanHtml
(
body
.
html
()));
const
title
=
checkData
.
cleanText
(
$
(
'
title
'
).
text
());
const
dataForIndex
=
{
tag
:
'
site
'
,
domain
:
this
.
config
.
domain
,
domain
:
this
.
config
.
crawl
.
domain
,
title
,
title_suggest
:
{
input
:
title
,
...
...
@@ -128,8 +127,8 @@ export default class crawlWebsite {
createdAt
:
new
Date
(),
};
if
(
$
(
_this
.
config
.
breadcrumb
).
text
().
length
)
{
dataForIndex
.
breadcrumb
=
checkData
.
slugText
(
checkData
.
cleanText
(
$
(
_this
.
config
.
breadcrumb
).
text
()));
if
(
$
(
this
.
config
.
crawl
.
breadcrumb
).
text
().
length
)
{
dataForIndex
.
breadcrumb
=
checkData
.
slugText
(
checkData
.
cleanText
(
$
(
this
.
config
.
crawl
.
breadcrumb
).
text
()));
}
if
(
$
(
'
h1
'
).
text
().
length
)
{
...
...
@@ -147,7 +146,7 @@ export default class crawlWebsite {
this
.
listDataForIndex
.
push
({
index
:
{
_index
:
Meteor
.
settings
.
private
.
elasticsearch
.
esIndex
,
_type
:
Meteor
.
settings
.
private
.
elasticsearch
.
esType
,
_type
:
this
.
config
.
userId
,
_id
:
currentUrl
,
},
});
...
...
imports/api/indexation/methods.js
View file @
fef1fde1
...
...
@@ -9,35 +9,48 @@ import IndexNetwork from '/imports/api/indexation/server/indexNetwork';
import
formWebsiteCrawlSchema
from
'
/imports/api/crawl/website/formWebsiteCrawlSchema
'
;
import
formApiCrawlSchema
from
'
/imports/api/crawl/api/formApiCrawlSchema
'
;
export
function
initIndexElastic
()
{
this
.
unblock
();
const
index
=
new
IndexGeneric
();
return
index
.
initElastic
();
}
export
function
reIndexElastic
()
{
this
.
unblock
();
const
index
=
new
IndexGeneric
();
return
index
.
reIndex
();
}
export
function
indexWebsite
(
data
)
{
check
(
data
,
Object
);
formWebsiteCrawlSchema
.
validate
(
data
);
const
index
=
new
IndexWebsite
();
return
index
.
start
(
data
);
}
export
function
indexApi
(
data
)
{
check
(
data
,
Object
);
console
.
log
(
data
);
formApiCrawlSchema
.
validate
(
data
);
const
index
=
new
IndexApi
();
return
index
.
start
(
data
);
}
export
function
indexNetwork
()
{
const
index
=
new
IndexNetwork
();
return
index
.
start
();
}
export
function
removeFileNetwork
(
id
)
{
check
(
id
,
String
);
Files
.
remove
({
_id
:
id
});
}
Meteor
.
methods
({
initIndexElastic
()
{
this
.
unblock
();
const
index
=
new
IndexGeneric
();
return
index
.
initElastic
();
},
reIndexElastic
()
{
this
.
unblock
();
const
index
=
new
IndexGeneric
();
return
index
.
reIndex
();
},
indexWebsite
(
data
)
{
check
(
data
,
Object
);
formWebsiteCrawlSchema
.
validate
(
data
);
const
index
=
new
IndexWebsite
();
return
index
.
start
(
data
);
},
indexApi
(
data
)
{
check
(
data
,
Object
);
formApiCrawlSchema
.
validate
(
data
);
const
index
=
new
IndexApi
();
return
index
.
start
(
data
);
},
indexNetwork
()
{
const
indexNetwork
=
new
IndexNetwork
();
return
indexNetwork
.
start
();
},
removeFileNetwork
(
id
)
{
check
(
id
,
String
);
Files
.
remove
({
_id
:
id
});
},
initIndexElastic
,
reIndexElastic
,
indexWebsite
,
indexApi
,
indexNetwork
,
removeFileNetwork
,
});
imports/api/indexation/server/elasticSearchConfig.js
deleted
100644 → 0
View file @
fc0518f2
// analyser
exports
.
analyser
=
{
settings
:
{
analysis
:
{
filter
:
{
// suppression de ces mots pour diminuer le bruit
french_elision
:
{
type
:
'
elision
'
,
articles_case
:
true
,
articles
:
[
'
l
'
,
'
m
'
,
'
t
'
,
'
qu
'
,
'
n
'
,
'
s
'
,
'
j
'
,
'
d
'
,
'
c
'
,
'
jusqu
'
,
'
quoiqu
'
,
'
lorsqu
'
,
'
puisqu
'
,
],
},
// synonyme
french_synonym
:
{
type
:
'
synonym
'
,
ignore_case
:
true
,
expand
:
true
,
synonyms
:
[
'
gosse, enfant
'
,
'
pmi, protection maternelle et infantile
'
,
],
},
// radical des mots
french_stemmer
:
{
type
:
'
stemmer
'
,
language
:
'
light_french
'
,
},
},
analyzer
:
{
// français elevé
french_heavy
:
{
tokenizer
:
'
icu_tokenizer
'
,
filter
:
[
'
french_elision
'
,
'
icu_folding
'
,
'
french_synonym
'
,
'
french_stemmer
'
,
'
lowercase
'
,
'
asciifolding
'
,
],
},
// français léger
french_light
:
{
tokenizer
:
'
icu_tokenizer
'
,
char_filter
:
[
'
html_strip
'
,
],
filter
:
[
'
french_elision
'
,
'
icu_folding
'
,
'
lowercase
'
,
'
asciifolding
'
,
],
},
// analyzer for url
url_analyzer
:
{
tokenizer
:
'
uax_url_email
'
,
filter
:
[
'
french_elision
'
,
'
icu_folding
'
,
'
lowercase
'
,
],
},
},
},
},
};
// mapping
exports
.
mapping
=
{
properties
:
{
tag
:
{
type
:
'
keyword
'
,
},
domain
:
{
type
:
'
keyword
'
,
},
apiName
:
{
type
:
'
keyword
'
,
},
title
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
fields
:
{
stemmed
:
{
type
:
'
text
'
,
analyzer
:
'
french_heavy
'
,
},
},
},
title_suggest
:
{
type
:
'
completion
'
,
analyzer
:
'
french_light
'
,
max_input_length
:
100
,
},
description
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
fields
:
{
stemmed
:
{
type
:
'
text
'
,
analyzer
:
'
french_heavy
'
,
},
},
},
body
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
fields
:
{
stemmed
:
{
type
:
'
text
'
,
analyzer
:
'
french_heavy
'
,
},
},
},
html
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
},
url
:
{
type
:
'
text
'
,
analyzer
:
'
url_analyzer
'
,
},
urlText
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
fields
:
{
stemmed
:
{
type
:
'
text
'
,
analyzer
:
'
french_heavy
'
,
},
},
},
h1
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
},
h2
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
},
breadcrumb
:
{
type
:
'
text
'
,
analyzer
:
'
french_light
'
,
},
listPdf
:
{
type
:
'
text
'
,
analyzer
:
'
url_analyzer
'
,
},
createdAt
:
{
type
:
'
date
'
,
},
},
};
imports/api/indexation/server/indexGeneric.js
View file @
fef1fde1
...
...
@@ -2,12 +2,16 @@
import
{
Meteor
}
from
'
meteor/meteor
'
;
import
elastic
from
'
/imports/libs/elasticsearch/elasticsearch
'
;
import
configElastic
from
'
/imports/libs/elasticsearch/elasticSearchConfig
'
;
import
{
getConfig
}
from
'
/imports/api/config/methods
'
;
const
esIndex
=
Meteor
.
settings
.
private
.
elasticsearch
.
esIndex
;
const
esType
=
Meteor
.
settings
.
private
.
elasticsearch
.
esType
;
export
default
class
IndexGeneric
{
constructor
()
{
this
.
config
=
getConfig
();
}
indexByBulk
(
data
,
hasFile
)
{
return
elastic
.
bulk
(
data
,
hasFile
);
}
...
...
@@ -32,7 +36,7 @@ export default class IndexGeneric {
const
mapping
=
configElastic
.
mapping
;
await
elastic
.
initAnalyzer
(
esIndex
,
analyser
);
await
elastic
.
initMapping
(
esIndex
,
esType
,
mapping
);
await
elastic
.
initMapping
(
esIndex
,
this
.
config
.
userId
,
mapping
);
return
elastic
.
createPipeline
();
}
}
imports/ui/pages/admin/indexation/website/website.html
View file @
fef1fde1
...
...
@@ -7,7 +7,7 @@
<h4
class=
"text-center"
>
Gestion de l'indexation des sites
</h4>
{{#autoForm id="formWebsiteCrawl" schema=formWebsiteCrawlSchema buttonContent="Indexer" type="method" meteormethod="indexWebsite" }}
{{> afQuickField name='urlWebsite'}}
{{> afQuickField name='
c
onfig' options=optionSelectConfig }}
{{> afQuickField name='
nameC
onfig' options=optionSelectConfig }}
<button
type=
"submit"
class=
"btn btn-success"
>
Indexer
</button>
{{/autoForm}}
...
...
imports/ui/pages/admin/indexation/website/website.js
View file @
fef1fde1
...
...
@@ -24,7 +24,7 @@ Template.siteIndexationTpl.helpers({
config
.
listConfig
.
forEach
((
item
)
=>
{
options
.
push
({
label
:
item
.
domain
,
value
:
JSON
.
stringify
(
item
)
,
value
:
item
.
domain
,
});
});
}
...
...
settings.json
View file @
fef1fde1
...
...
@@ -2,8 +2,7 @@
"private"
:
{
"elasticsearch"
:
{
"host"
:
"localhost:9201"
,
"esIndex"
:
"idsearch"
,
"esType"
:
"haute-savoie"
"esIndex"
:
"idsearch"
}
},
"public"
:
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment