hw2: done ex1, ex2a, ex2b
This commit is contained in:
parent
be9046ca4c
commit
d060e66518
10 changed files with 28659 additions and 0 deletions
BIN
Assignment2_part1/Assignment2_part1.pdf
Normal file
BIN
Assignment2_part1/Assignment2_part1.pdf
Normal file
Binary file not shown.
30
Assignment2_part1/convert.sh
Executable file
30
Assignment2_part1/convert.sh
Executable file
|
@ -0,0 +1,30 @@
|
||||||
|
#!/bin/bash
#
# Convert data/restaurants.csv (next to this script) into JSON Lines:
# one compact JSON object per restaurant, written to data/restaurants.jsonl.
#
# Requires: jq.
#
# The shebang is bash (not sh) because ${BASH_SOURCE[0]} below is a bashism
# that fails under a strictly POSIX /bin/sh such as dash.

set -euo pipefail

# Absolute directory of this script, so it can be run from any working dir.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)

input="$SCRIPT_DIR/data/restaurants.csv"
output="$SCRIPT_DIR/data/restaurants.jsonl"

# In order:
# - Slurp the whole CSV as one raw string and split it into lines
# - Drop the header row (.[1:]) and every empty line; filtering empty lines
#   (instead of blindly dropping the last element) keeps the final record
#   even when the file does not end with a newline
# - Split each row on "," and build one object per row
# - Emit each object on its own line (--compact-output)
#
# NOTE: the split on "," is deliberately naive. The quoted `"[lon, lat]"`
# column therefore arrives as two fields: .[8] carries the leading `"[` and
# .[9] the trailing `]"`, which are stripped before converting to numbers.
# Any other field containing a comma would shift the columns.
jq --slurp --raw-input --compact-output '
  split("\n") | .[1:][] | select(length > 0) | split(",") |
  {
    "id": .[0],
    "name": .[1],
    "city": .[2],
    "location": {
      "lon": (.[8] | sub("^\"\\["; "") | sub("^\\s+"; "") | tonumber),
      "lat": (.[9] | sub("\\]\"$"; "") | sub("^\\s+"; "") | tonumber)
    },
    "averageCostForTwo": .[3],
    "aggregateRating": .[4],
    "ratingText": .[5],
    "votes": .[6],
    "date": .[7]
  }' "$input" > "$output"
|
9500
Assignment2_part1/data/restaurants.csv
Normal file
9500
Assignment2_part1/data/restaurants.csv
Normal file
File diff suppressed because it is too large
Load diff
9499
Assignment2_part1/data/restaurants.jsonl
Normal file
9499
Assignment2_part1/data/restaurants.jsonl
Normal file
File diff suppressed because it is too large
Load diff
50
Assignment2_part1/mappings.json
Normal file
50
Assignment2_part1/mappings.json
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
{
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "long"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"city": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"location": {
|
||||||
|
"type": "geo_point"
|
||||||
|
},
|
||||||
|
"averageCostForTwo": {
|
||||||
|
"type": "float"
|
||||||
|
},
|
||||||
|
"aggregateRating": {
|
||||||
|
"type": "float"
|
||||||
|
},
|
||||||
|
"ratingText": {
|
||||||
|
"type": "text",
|
||||||
|
"fields": {
|
||||||
|
"keyword": {
|
||||||
|
"type": "keyword",
|
||||||
|
"ignore_above": 256
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"votes": {
|
||||||
|
"type": "float"
|
||||||
|
},
|
||||||
|
"date": {
|
||||||
|
"type": "date",
|
||||||
|
"format": "date_optional_time"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
17
Assignment2_part1/queries/query2a.http
Normal file
17
Assignment2_part1/queries/query2a.http
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
GET /restaurants/_search
|
||||||
|
{
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{ "match": { "name": "pizza" } }
|
||||||
|
],
|
||||||
|
"should": [
|
||||||
|
{ "match": { "ratingText": "Very Good" } },
|
||||||
|
{ "match": { "ratingText": "Excellent" } }
|
||||||
|
],
|
||||||
|
"must_not": [
|
||||||
|
{ "match": { "name": "pasta" } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
31
Assignment2_part1/queries/query2b.http
Normal file
31
Assignment2_part1/queries/query2b.http
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
GET /restaurants/_search
|
||||||
|
{
|
||||||
|
"from" : 0,
|
||||||
|
"size" : 5,
|
||||||
|
"sort" : [
|
||||||
|
{ "averageCostForTwo" : "desc" }
|
||||||
|
],
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{
|
||||||
|
"range": {
|
||||||
|
"date": {
|
||||||
|
"gte": "2018-01-01",
|
||||||
|
"lte": "2018-12-31"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"geo_distance": {
|
||||||
|
"distance": "20km",
|
||||||
|
"location": {
|
||||||
|
"lat": 33.9259,
|
||||||
|
"lon": -83.3389
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
32
Assignment2_part1/upload.sh
Executable file
32
Assignment2_part1/upload.sh
Executable file
|
@ -0,0 +1,32 @@
|
||||||
|
#!/bin/bash
#
# Recreate the `restaurants` index on a local Elasticsearch node, install the
# field mappings from mappings.json and upload every document found in
# data/restaurants.jsonl (one JSON object per line, each with an `id` field).
#
# Requires: curl, jq.
#
# Optional environment overrides:
#   ELASTIC_DIR       Elasticsearch install dir (used to locate http_ca.crt)
#   ELASTIC_URL       base URL of the node
#   ELASTIC_PASSWORD  password of the `elastic` user

set -euo pipefail

# Absolute directory of this script, so it can be run from any working dir.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)

elastic_dir="${ELASTIC_DIR:-$HOME/bin/elasticsearch-8.6.2}"
elastic_url="${ELASTIC_URL:-https://localhost:9200}"
crt="$elastic_dir/config/certs/http_ca.crt"

input="$SCRIPT_DIR/data/restaurants.jsonl"
mappings="$SCRIPT_DIR/mappings.json"

# NOTE(review): a credential is committed here as the fallback value. It is
# kept only so the script still runs unchanged; prefer ELASTIC_PASSWORD and
# rotate this password.
password="${ELASTIC_PASSWORD:-GZH*wqNTvQ0WRdrPrpHm}"

# Upper bound on concurrent uploads; the loop below drains the batch whenever
# this many background jobs are in flight.
max_jobs=16

# curl against the cluster with the CA certificate and credentials applied.
# TLS verification stays enabled (no -k): the CA cert is what makes it pass.
es_curl() {
  curl --silent --show-error --cacert "$crt" -u "elastic:$password" "$@"
}

# PUT one JSON document ($2) under the given id ($1) and print the `_id`
# reported back by Elasticsearch.
upload_doc() {
  local id=$1
  local doc=$2
  printf '%s\n' "$doc" \
    | es_curl -X PUT --data-binary @- "$elastic_url/restaurants/_doc/$id" \
      -H "Content-Type: application/json" \
    | jq ._id
}

# Create index. The DELETE is best-effort: it is expected to fail when the
# index does not exist yet, hence the `|| true`.
es_curl -X DELETE "$elastic_url/restaurants" | jq . || true
es_curl -X PUT "$elastic_url/restaurants" | jq .

# Upload mappings
es_curl -X POST --data-binary "@$mappings" \
  "$elastic_url/restaurants/_mappings/" \
  -H "Content-Type: application/json" | jq .

# Upload documents one by one, at most $max_jobs at a time, and wait for all
# of them so the script neither floods the node nor exits mid-upload.
pids=()
failed=0
while IFS= read -r line || [[ -n "$line" ]]; do
  id=$(jq '.id | tonumber' <<<"$line")
  upload_doc "$id" "$line" &
  pids+=("$!")
  if [[ "${#pids[@]}" -ge "$max_jobs" ]]; then
    for pid in "${pids[@]}"; do
      wait "$pid" || failed=1
    done
    pids=()
  fi
done < "$input"

# Drain whatever is left of the final, partially filled batch.
if [[ "${#pids[@]}" -gt 0 ]]; then
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=1
  done
fi

if [[ "$failed" -ne 0 ]]; then
  printf 'error: at least one document upload failed\n' >&2
  exit 1
fi
|
BIN
Assignment2_part2/assignment2_part2.pdf
Normal file
BIN
Assignment2_part2/assignment2_part2.pdf
Normal file
Binary file not shown.
9500
Assignment2_part2/restaurants_extended.csv
Normal file
9500
Assignment2_part2/restaurants_extended.csv
Normal file
File diff suppressed because it is too large
Load diff
Reference in a new issue