hw2: done ex1, ex2a, ex2b
This commit is contained in:
parent
be9046ca4c
commit
d060e66518
10 changed files with 28659 additions and 0 deletions
BIN
Assignment2_part1/Assignment2_part1.pdf
Normal file
BIN
Assignment2_part1/Assignment2_part1.pdf
Normal file
Binary file not shown.
30
Assignment2_part1/convert.sh
Executable file
30
Assignment2_part1/convert.sh
Executable file
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
#
# Converts data/restaurants.csv (located next to this script) into JSON Lines
# at data/restaurants.jsonl: one compact JSON object per restaurant.
#
# Requires: bash, jq.
#
# The interpreter must be bash, not a POSIX /bin/sh: ${BASH_SOURCE[0]} and
# the "&>" redirection used below are bash extensions.
set -e

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

input="$SCRIPT_DIR/data/restaurants.csv"
output="$SCRIPT_DIR/data/restaurants.jsonl"

# The whole file is slurped as one raw string (-s --raw-input), then:
#   1. split into lines;
#   2. drop the header row (.[1:]);
#   3. drop empty lines, i.e. the empty element produced by the trailing
#      newline -- unlike a blind .[1:-1], this keeps the last record when
#      the file has no trailing newline;
#   4. split every line on "," (naive split: it does not honour CSV quoting,
#      so the quoted `"[lon, lat]"` column ends up as columns 8 and 9, whose
#      bracket/quote/space decoration is stripped before tonumber);
#   5. emit one object per row; -c prints each object on its own line.
# jq reads "$input" directly; no `cat` is needed because jq ignores stdin
# whenever a file argument is given.
jq -c -s --raw-input '
  split("\n")
  | .[1:]
  | map(select(length > 0))
  | .[]
  | split(",")
  | {
      "id": .[0],
      "name": .[1],
      "city": .[2],
      "location": {
        "lon": (.[8] | sub("^\"\\["; "") | sub("\\s*"; "") | tonumber),
        "lat": (.[9] | sub("\\]\"$"; "") | sub("\\s*"; "") | tonumber)
      },
      "averageCostForTwo": .[3],
      "aggregateRating": .[4],
      "ratingText": .[5],
      "votes": .[6],
      "date": .[7]
    }
' "$input" > "$output"
|
9500
Assignment2_part1/data/restaurants.csv
Normal file
9500
Assignment2_part1/data/restaurants.csv
Normal file
File diff suppressed because it is too large
Load diff
9499
Assignment2_part1/data/restaurants.jsonl
Normal file
9499
Assignment2_part1/data/restaurants.jsonl
Normal file
File diff suppressed because it is too large
Load diff
50
Assignment2_part1/mappings.json
Normal file
50
Assignment2_part1/mappings.json
Normal file
|
@ -0,0 +1,50 @@
|
|||
{
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "long"
|
||||
},
|
||||
"name": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"city": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"location": {
|
||||
"type": "geo_point"
|
||||
},
|
||||
"averageCostForTwo": {
|
||||
"type": "float"
|
||||
},
|
||||
"aggregateRating": {
|
||||
"type": "float"
|
||||
},
|
||||
"ratingText": {
|
||||
"type": "text",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"votes": {
|
||||
"type": "float"
|
||||
},
|
||||
"date": {
|
||||
"type": "date",
|
||||
"format": "date_optional_time"
|
||||
}
|
||||
}
|
||||
}
|
17
Assignment2_part1/queries/query2a.http
Normal file
17
Assignment2_part1/queries/query2a.http
Normal file
|
@ -0,0 +1,17 @@
|
|||
GET /restaurants/_search
|
||||
{
|
||||
"query": {
|
||||
"bool": {
|
||||
"must": [
|
||||
{ "match": { "name": "pizza" } }
|
||||
],
|
||||
"should": [
|
||||
{ "match": { "ratingText": "Very Good" } },
|
||||
{ "match": { "ratingText": "Excellent" } }
|
||||
],
|
||||
"must_not": [
|
||||
{ "match": { "name": "pasta" } }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
31
Assignment2_part1/queries/query2b.http
Normal file
31
Assignment2_part1/queries/query2b.http
Normal file
|
@ -0,0 +1,31 @@
|
|||
GET /restaurants/_search
|
||||
{
|
||||
"from" : 0,
|
||||
"size" : 5,
|
||||
"sort" : [
|
||||
{ "averageCostForTwo" : "desc" }
|
||||
],
|
||||
"query": {
|
||||
"bool": {
|
||||
"must": [
|
||||
{
|
||||
"range": {
|
||||
"date": {
|
||||
"gte": "2018-01-01",
|
||||
"lte": "2018-12-31"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"geo_distance": {
|
||||
"distance": "20km",
|
||||
"location": {
|
||||
"lat": 33.9259,
|
||||
"lon": -83.3389
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
32
Assignment2_part1/upload.sh
Executable file
32
Assignment2_part1/upload.sh
Executable file
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash
#
# Recreates the "restaurants" index on a local Elasticsearch node, applies
# mappings.json to it, and uploads every line of data/restaurants.jsonl as
# one document whose _id is the numeric value of the document's "id" field.
#
# Requires: bash, curl, jq.
# Environment:
#   ELASTIC_PASSWORD  password of the "elastic" user (optional override).

# pipefail: without it a failed curl is masked by the trailing `jq`, which
# exits 0 on empty input.
set -e -o pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

elastic_dir="$HOME/bin/elasticsearch-8.6.2"
elastic_url="https://localhost:9200"
crt="$elastic_dir/config/certs/http_ca.crt"

input="$SCRIPT_DIR/data/restaurants.jsonl"
# Resolved against the script directory, like $input, so the script also
# works when it is started from a different working directory.
mappings="$SCRIPT_DIR/mappings.json"

# NOTE(review): the fallback below is a credential committed to the
# repository. It is kept only so existing invocations keep working; rotate
# it and rely on ELASTIC_PASSWORD instead.
password="${ELASTIC_PASSWORD:-GZH*wqNTvQ0WRdrPrpHm}"

# Upper bound on concurrently running upload pipelines.
max_jobs=32

# Create index (the DELETE is best-effort: its failure is deliberately
# ignored, e.g. when the index does not exist yet)
curl --cacert "$crt" -u "elastic:$password" \
  -X DELETE "$elastic_url/restaurants" | jq . || true
curl --cacert "$crt" -u "elastic:$password" \
  -X PUT "$elastic_url/restaurants" | jq .

# Upload mappings
curl --cacert "$crt" -u "elastic:$password" -X POST \
  --data-binary "@$mappings" "$elastic_url/restaurants/_mappings/" \
  -H "Content-Type: application/json" | jq .

# Upload documents one by one, at most $max_jobs at a time.
# - printf with a quoted "$line" forwards the JSON byte-for-byte; an unquoted
#   `echo $line` would word-split and glob-expand the document.
# - No -k/--insecure: the certificate is verified against --cacert, exactly
#   like the requests above.
# - `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
running=0
while IFS= read -r line || [[ -n "$line" ]]; do
  id=$(printf '%s\n' "$line" | jq '.id | tonumber')
  printf '%s\n' "$line" | curl --cacert "$crt" -u "elastic:$password" -X PUT \
    --data-binary @- "$elastic_url/restaurants/_doc/$id" \
    -H "Content-Type: application/json" | jq ._id &
  running=$((running + 1))
  if [[ "$running" -ge "$max_jobs" ]]; then
    # Barrier: let the current batch finish before starting the next one.
    wait
    running=0
  fi
done < "$input"

# Do not return before the last batch of uploads has completed.
wait
|
BIN
Assignment2_part2/assignment2_part2.pdf
Normal file
BIN
Assignment2_part2/assignment2_part2.pdf
Normal file
Binary file not shown.
9500
Assignment2_part2/restaurants_extended.csv
Normal file
9500
Assignment2_part2/restaurants_extended.csv
Normal file
File diff suppressed because it is too large
Load diff
Reference in a new issue