hw2: done ex1, ex2a, ex2b

This commit is contained in:
Claudio Maggioni 2023-04-24 09:49:04 +02:00
parent be9046ca4c
commit d060e66518
10 changed files with 28659 additions and 0 deletions

Binary file not shown.

30
Assignment2_part1/convert.sh Executable file
View file

@ -0,0 +1,30 @@
#!/bin/bash
# Convert data/restaurants.csv (located next to this script) into JSON Lines:
# one restaurant object per line, written to data/restaurants.jsonl.
#
# Requires: jq.
set -euo pipefail

# Directory containing this script, so the data paths resolve regardless of
# the caller's working directory. BASH_SOURCE and `&>` are bash features,
# which is why the shebang asks for bash rather than sh.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

input="$SCRIPT_DIR/data/restaurants.csv"
output="$SCRIPT_DIR/data/restaurants.jsonl"

# In order:
# - Slurp the whole CSV as one raw string and split it into lines
# - Drop the header row and any empty line (splitting on the final newline
#   leaves an empty trailing element)
# - Split each line on commas and build one JSON object per row
# - Emit the objects one per line (JSON Lines) via `.[]` + --compact-output
#
# The location column is a quoted "[lon, lat]" pair, so the plain comma split
# spreads it over fields 8 and 9; the sub() calls strip the leftover `"[`,
# `]"` and leading whitespace before the numeric conversion.
#
# NOTE(review): the comma split does not understand CSV quoting; a comma
# inside any other field (e.g. a restaurant name) would shift the columns.
# Confirm the data set never contains one.
jq --slurp --raw-input --compact-output \
  'split("\n") | .[1:] | map(select(length > 0)) | map(split(",")) |
  map({
    "id": .[0],
    "name": .[1],
    "city": .[2],
    "location": {
      "lon": .[8] | sub("^\"\\["; "") | sub("\\s*"; "") | tonumber,
      "lat": .[9] | sub("\\]\"$"; "") | sub("\\s*"; "") | tonumber
    },
    "averageCostForTwo": .[3],
    "aggregateRating": .[4],
    "ratingText": .[5],
    "votes": .[6],
    "date": .[7]
  }) | .[]' "$input" > "$output"

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,50 @@
{
"properties": {
"id": {
"type": "long"
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"city": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"location": {
"type": "geo_point"
},
"averageCostForTwo": {
"type": "float"
},
"aggregateRating": {
"type": "float"
},
"ratingText": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"votes": {
"type": "float"
},
"date": {
"type": "date",
"format": "date_optional_time"
}
}
}

View file

@ -0,0 +1,17 @@
GET /restaurants/_search
{
"query": {
"bool": {
"must": [
{ "match": { "name": "pizza" } }
],
"should": [
{ "match": { "ratingText": "Very Good" } },
{ "match": { "ratingText": "Excellent" } }
],
"must_not": [
{ "match": { "name": "pasta" } }
]
}
}
}

View file

@ -0,0 +1,31 @@
GET /restaurants/_search
{
"from" : 0,
"size" : 5,
"sort" : [
{ "averageCostForTwo" : "desc" }
],
"query": {
"bool": {
"must": [
{
"range": {
"date": {
"gte": "2018-01-01",
"lte": "2018-12-31"
}
}
},
{
"geo_distance": {
"distance": "20km",
"location": {
"lat": 33.9259,
"lon": -83.3389
}
}
}
]
}
}
}

32
Assignment2_part1/upload.sh Executable file
View file

@ -0,0 +1,32 @@
#!/bin/bash
# Recreate the `restaurants` index on an Elasticsearch node, install the field
# mappings, then index every document found in data/restaurants.jsonl.
#
# Optional environment overrides (defaults are the values this script has
# always used):
#   ELASTIC_DIR       Elasticsearch install dir (holds config/certs/http_ca.crt)
#   ELASTIC_URL       base URL of the node
#   ELASTIC_PASSWORD  password of the `elastic` user
#   MAX_JOBS          number of document uploads allowed to run concurrently
#
# Requires: curl, jq.
set -euo pipefail

# Directory containing this script; all input files are resolved against it.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

elastic_dir="${ELASTIC_DIR:-$HOME/bin/elasticsearch-8.6.2}"
elastic_url="${ELASTIC_URL:-https://localhost:9200}"
crt="$elastic_dir/config/certs/http_ca.crt"
input="$SCRIPT_DIR/data/restaurants.jsonl"
# NOTE(review): assumes mappings.json sits beside this script; it used to be
# read from the caller's working directory, unlike every other path here.
mappings="$SCRIPT_DIR/mappings.json"
# NOTE(review): this default is a credential committed to version control.
# Prefer exporting ELASTIC_PASSWORD and rotating the committed value.
password="${ELASTIC_PASSWORD:-GZH*wqNTvQ0WRdrPrpHm}"
max_jobs="${MAX_JOBS:-16}"

# Run curl against the cluster with the CA certificate and basic-auth
# credentials every request needs; all arguments are passed through to curl.
# The progress meter is silenced (errors are still shown) because many
# uploads run at once.
es_curl() {
  curl --silent --show-error --cacert "$crt" -u "elastic:$password" "$@"
}

# Create index. Any previous copy is dropped first; `|| true` keeps a failed
# delete from aborting the script under `set -e`.
es_curl -X DELETE "$elastic_url/restaurants" | jq . || true
es_curl -X PUT "$elastic_url/restaurants" | jq .

# Upload mappings
es_curl -X POST --data-binary "@$mappings" \
  "$elastic_url/restaurants/_mappings/" \
  -H "Content-Type: application/json" | jq .

# Upload documents one by one, using each document's numeric `id` as the
# Elasticsearch document id. Uploads run in the background in batches of at
# most $max_jobs so the machine is not flooded with curl processes.
running=0
while IFS= read -r line || [[ -n "$line" ]]; do
  id=$(jq '.id | tonumber' <<< "$line")
  # printf with a quoted argument sends the JSON line verbatim; an unquoted
  # echo would collapse whitespace and expand glob characters in the data.
  printf '%s\n' "$line" \
    | es_curl -X PUT --data-binary @- \
      "$elastic_url/restaurants/_doc/$id" \
      -H "Content-Type: application/json" \
    | jq ._id &
  running=$((running + 1))
  if [[ "$running" -ge "$max_jobs" ]]; then
    wait
    running=0
  fi
done < "$input"

# Do not return before the last batch of uploads has finished.
# NOTE: a bare `wait` does not report failures of individual uploads.
wait

Binary file not shown.

File diff suppressed because it is too large Load diff