This repository has been archived on 2022-12-21. You can view files and clone it, but cannot push or open issues or pull requests.
ddm/hw02/Untitled.ipynb

559 lines
15 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 31,
"id": "4abbae95",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import time\n",
"from pymongo import MongoClient"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1d3a065",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 75,
"id": "f3196535",
"metadata": {},
"outputs": [],
"source": [
"mongo_conn_str = \"mongodb://localhost:27017\"\n",
"mongo = MongoClient(mongo_conn_str)\n",
"db = mongo[\"ddm\"]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "264e24b7",
"metadata": {},
"outputs": [],
"source": [
"def start_the_time():\n",
" global start_time\n",
" start_time = time.time()\n",
" \n",
"def end_the_time():\n",
" print(\"--- %s seconds ---\" % (time.time() - start_time))"
]
},
{
"cell_type": "markdown",
"id": "9becfc2a",
"metadata": {},
"source": [
"### Top 10 journals for numbers of papers"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "8e98cd86",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 0.0001361370086669922 seconds ---\n",
"['Morphometric MRI as a diagnostic biomarker of frontotemporal dementia: A systematic review to determine clinical applicability', 'Low adherence of Swiss children to national dietary guidelines', 'Decomposing broadcast algorithms using abstract MAC layers']\n"
]
}
],
"source": [
"start_the_time()\n",
"result = db[\"papers\"].find({ \n",
" \"authors.email\": {\"$regex\": \"@usi\\.ch\"}\n",
"}, {\n",
" 'title': 1\n",
"})\n",
"end_the_time()\n",
"\n",
"titles = [doc['title'] for doc in result]\n",
"print(titles)"
]
},
{
"cell_type": "markdown",
"id": "c0e9ad5e",
"metadata": {},
"source": [
"### Most 3 cited authors in 'Strategic info-mediaries'"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "a8781d01",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 0.302872896194458 seconds ---\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>_id</th>\n",
" <th>referenceCount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Freshwater</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Biodiversity</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Marine</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Climate change</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Ecosystem-based management</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Coastal</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Eutrophication</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Phosphorus</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Policy</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Agriculture</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" _id referenceCount\n",
"0 Freshwater 12\n",
"1 Biodiversity 9\n",
"2 Marine 8\n",
"3 Climate change 8\n",
"4 Ecosystem-based management 7\n",
"5 Coastal 6\n",
"6 Eutrophication 5\n",
"7 Phosphorus 5\n",
"8 Policy 5\n",
"9 Agriculture 4"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline = [\n",
" {\n",
" \"$match\": {\n",
" \"publicationDetails.journal\": \"Vertical e-markets\"\n",
" }\n",
" },\n",
" { \n",
" \"$unwind\": \"$authors\" \n",
" }, \n",
" { \n",
" \"$lookup\": {\n",
" \"from\": \"authors\",\n",
" \"localField\": \"authors.authorId\",\n",
" \"foreignField\": \"_id\",\n",
" \"as\": \"authors\"\n",
" }\n",
" },\n",
" {\n",
" \"$match\": {\n",
" \"authors.bio\": {\n",
" \"$regex\": \"[Ss]uccess\"\n",
" }\n",
" }\n",
" },\n",
" { \n",
" \"$unwind\": \"$keywords\" \n",
" },\n",
" { \n",
" \"$group\": {\n",
" \"_id\": \"$keywords\", \n",
" \"referenceCount\": { \n",
" \"$sum\": 1\n",
" } \n",
" } \n",
" },\n",
" {\n",
" \"$sort\": {\n",
" \"referenceCount\": -1\n",
" }\n",
" },\n",
" {\n",
" \"$limit\": 10\n",
" }\n",
"]\n",
"\n",
"start_the_time()\n",
"result = db[\"papers\"].aggregate(pipeline)\n",
"end_the_time()\n",
"\n",
"pd.DataFrame(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fc8b56f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "39ae3826",
"metadata": {},
"source": [
"### Title"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "146fef1e",
"metadata": {},
"outputs": [],
"source": [
"pipeline = [\n",
" {\n",
" \"$match\": {\n",
" \"publicationDetails.journal\": \"Next-generation users\",\n",
" }\n",
" },\n",
" { \"$unwind\": \"$authors\" }, \n",
" { \n",
" \"$group\": {\n",
" \"_id\": \"$references.authors.name\", \n",
" \"referenceCount\": { \n",
" \"$sum\": 1\n",
" } \n",
" } \n",
" },\n",
" {\n",
" \"$sort\": {\n",
" \"referenceCount\": -1\n",
" }\n",
" },\n",
" {\n",
" \"$limit\": 3\n",
" }\n",
"]\n",
"\n",
"start_the_time()\n",
"result = db[\"papers\"].aggregate(pipeline)\n",
"end_the_time()\n",
"\n",
"pd.DataFrame(result)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "5c02ad39",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 0.0010950565338134766 seconds ---\n"
]
}
],
"source": [
"new_journal = { \n",
" 'issn': '89012388',\n",
" 'name': 'Advanced Topics on Databases',\n",
" 'volumes': []\n",
"}\n",
"start_the_time()\n",
"new_journal_id = db[\"journals\"].insert_one(new_journal).inserted_id\n",
"end_the_time()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "2f128b04",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 0.09116077423095703 seconds ---\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>_id</th>\n",
" <th>paper_number</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>One-to-one content</td>\n",
" <td>744</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Vertical e-markets</td>\n",
" <td>515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Bricks-and-clicks web-readiness</td>\n",
" <td>483</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Plug-and-play web-readiness</td>\n",
" <td>361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Back-end partnerships</td>\n",
" <td>354</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Next-generation users</td>\n",
" <td>334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Distributed mindshare</td>\n",
" <td>329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Enterprise e-services</td>\n",
" <td>281</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Strategic info-mediaries</td>\n",
" <td>276</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Clicks-and-mortar channels</td>\n",
" <td>271</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" _id paper_number\n",
"0 One-to-one content 744\n",
"1 Vertical e-markets 515\n",
"2 Bricks-and-clicks web-readiness 483\n",
"3 Plug-and-play web-readiness 361\n",
"4 Back-end partnerships 354\n",
"5 Next-generation users 334\n",
"6 Distributed mindshare 329\n",
"7 Enterprise e-services 281\n",
"8 Strategic info-mediaries 276\n",
"9 Clicks-and-mortar channels 271"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline = [{\n",
" \"$group\": {\n",
" \"_id\":\"$publicationDetails.journal\",\n",
" \"paper_number\":{\n",
" \"$sum\":1\n",
" }\n",
" }\n",
"},{\n",
" \"$sort\":{\n",
" \"paper_number\":-1\n",
" }\n",
"},{\n",
" \"$limit\":10\n",
"}]\n",
"\n",
"start_the_time()\n",
"result = db[\"papers\"].aggregate(pipeline)\n",
"end_the_time()\n",
"\n",
"pd.DataFrame(result)"
]
},
{
"cell_type": "code",
"execution_count": 132,
"id": "8f12712b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--- 1.3057661056518555 seconds ---\n"
]
},
{
"data": {
"text/plain": [
"'{\"_id\":{\"0\":{\"journal\":\"One-to-one content\",\"sectionTitle\":\"Introduction\"},\"1\":{\"journal\":\"One-to-one content\",\"sectionTitle\":\"Discussion\"},\"2\":{\"journal\":\"Vertical e-markets\",\"sectionTitle\":\"Introduction\"},\"3\":{\"journal\":\"One-to-one content\",\"sectionTitle\":\"Results\"},\"4\":{\"journal\":\"Bricks-and-clicks web-readiness\",\"sectionTitle\":\"Method details\"},\"5\":{\"journal\":\"Plug-and-play web-readiness\",\"sectionTitle\":\"Introduction\"},\"6\":{\"journal\":\"Back-end partnerships\",\"sectionTitle\":\"Introduction\"},\"7\":{\"journal\":\"Next-generation users\",\"sectionTitle\":\"Introduction\"},\"8\":{\"journal\":\"Plug-and-play web-readiness\",\"sectionTitle\":\"Discussion\"},\"9\":{\"journal\":\"Next-generation users\",\"sectionTitle\":\"Results\"}},\"sectionCount\":{\"0\":630,\"1\":512,\"2\":506,\"3\":503,\"4\":371,\"5\":353,\"6\":350,\"7\":332,\"8\":330,\"9\":322}}'"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline = [\n",
" { \n",
" \"$unwind\": \"$content\"\n",
" }, {\n",
" \"$group\": {\n",
" \"_id\": {\n",
" \"journal\": \"$publicationDetails.journal\",\n",
" \"sectionTitle\": \"$content.title\"\n",
" }, \n",
" \"sectionCount\": {\n",
" \"$sum\": 1\n",
" }\n",
" }\n",
" }, {\n",
" \"$sort\": {\n",
" \"sectionCount\": -1\n",
" }\n",
" }, {\n",
" \"$limit\":10\n",
" }\n",
"]\n",
"\n",
"start_the_time()\n",
"result = db[\"papers\"].aggregate(pipeline)\n",
"end_the_time()\n",
"\n",
"pd.DataFrame(result).to_json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecfd45d9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}