Commit 20034681 authored by abrahaja

Search Engine

#!/bin/bash
#
# index app script
#
# Stop on errors, print commands
# See https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuo pipefail
set -x

# Sanity check command line options
usage() {
  echo "Usage: $0 (start|stop|restart|debug)"
}

if [ $# -ne 1 ]; then
  usage
  exit 1
fi
start() {
  if lsof -Pi :8001 -sTCP:LISTEN; then
    echo "Error: a process is already using port 8001"
    exit 1
  fi
  echo "starting index server ..."
  export FLASK_APP=index
  export INDEX_SETTINGS=config.py
  flask run --host 0.0.0.0 --port 8001 &> /dev/null &
}

stop() {
  echo "stopping index server ..."
  pkill -f 'flask run --host 0.0.0.0 --port 8001'
}

debug() {
  if lsof -Pi :8001 -sTCP:LISTEN; then
    echo "Error: a process is already using port 8001"
    exit 1
  fi
  echo "starting index server ..."
  export FLASK_APP=index
  export INDEX_SETTINGS=config.py
  export FLASK_DEBUG=True
  flask run --host 0.0.0.0 --port 8001
}
case $1 in
  "start")
    start
    ;;
  "stop")
    stop
    ;;
  "restart")
    stop
    start
    ;;
  "debug")
    debug
    ;;
  *)
    usage
    exit 1
    ;;
esac
#!/bin/bash
#
# wikipedia485db
#
# Database management shell script
#
# Supports the create, destroy, reset, and dump commands
#
# Stop on errors, print commands
# See https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuo pipefail
set -x

# Sanity check command line options
usage() {
  echo "Usage: $0 (create|destroy|reset|dump)"
}

if [ $# -ne 1 ]; then
  usage
  exit 1
fi
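
# Typical flow (a sketch; assumes the script is saved as wikipedia485db and is
# run from the repository root, since the paths below are relative to it):
#   ./wikipedia485db create    # build search/search/var/wikipedia.sqlite3 from the schema
#   ./wikipedia485db dump      # print every row of the Documents table
#   ./wikipedia485db destroy   # delete the database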
# Parse argument. $1 is the first argument
case $1 in
  "create")
    DIR=search/search/var/
    if [ -d "$DIR" ]; then
      echo "Error: database already exists"
      exit 1
    fi
    mkdir -p search/search/var/
    sqlite3 search/search/var/wikipedia.sqlite3 < search/search/sql/wikipedia.sql
    ;;
  "destroy")
    # Remove the whole var/ directory, not just the .sqlite3 file, so that a
    # subsequent create does not fail the directory-exists check above
    rm -rf search/search/var/
    ;;
  "reset")
    rm -rf search/search/var/
    mkdir -p search/search/var/
    sqlite3 search/search/var/wikipedia.sqlite3 < search/search/sql/wikipedia.sql
    ;;
  "dump")
    sqlite3 -batch -line search/search/var/wikipedia.sqlite3 'SELECT * FROM Documents'
    ;;
  *)
    usage
    exit 1
    ;;
esac
#!/bin/bash
#
# Install script: create a Python virtual environment and install the
# index and search apps into it
#
# Stop on errors, print commands
set -Eeuo pipefail
set -x

python3 -m venv env
source env/bin/activate
mkdir -p tmp
export TMPDIR=tmp
pip install -r index/requirements.txt
pip install -e index
pip install -r search/requirements.txt
pip install -e search

# Put the provided hadoop wrapper (tests/utils/hadoop.py) on the virtualenv
# PATH so the pipeline can invoke it as `hadoop`
pushd "$VIRTUAL_ENV/bin"
ln -sf ../../tests/utils/hadoop.py hadoop
popd
#!/bin/bash
#
# search app script
#
# Stop on errors, print commands
# See https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuo pipefail
set -x

# Sanity check command line options
usage() {
  echo "Usage: $0 (start|stop|restart|debug)"
}

if [ $# -ne 1 ]; then
  usage
  exit 1
fi
start() {
  if lsof -Pi :8000 -sTCP:LISTEN; then
    echo "Error: a process is already using port 8000"
    exit 1
  fi
  echo "starting search server ..."
  export FLASK_APP=search
  export SEARCH_SETTINGS=config.py
  flask run --host 0.0.0.0 --port 8000 &> /dev/null &
}

stop() {
  echo "stopping search server ..."
  pkill -f 'flask run --host 0.0.0.0 --port 8000'
}

debug() {
  if lsof -Pi :8000 -sTCP:LISTEN; then
    echo "Error: a process is already using port 8000"
    exit 1
  fi
  echo "starting search server ..."
  export FLASK_APP=search
  export SEARCH_SETTINGS=config.py
  export FLASK_DEBUG=True
  flask run --host 0.0.0.0 --port 8000
}
case $1 in
  "start")
    start
    ;;
  "stop")
    stop
    ;;
  "restart")
    stop
    start
    ;;
  "debug")
    debug
    ;;
  *)
    usage
    exit 1
    ;;
esac
#!/bin/bash
./pipeline.sh
set -x
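# MapReduce output order is not deterministic, so sort both the generated
# index and the expected output before comparing them line by line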
sort inverted_index.txt > sorted_inverted_index.txt
sort correct_small_output.txt > sorted_correct_small_output.txt
diff sorted_correct_small_output.txt sorted_inverted_index.txt | cat
character 0.47712125471966244 2 1 1.593512841936855
maintenance 0.47712125471966244 2 1 1.593512841936855
mike 0.47712125471966244 1 1 1.138223458526325
kurt 0.47712125471966244 2 1 1.593512841936855
peter 0.47712125471966244 3 1 2.048802225347385
flaw 0.47712125471966244 2 1 1.593512841936855
heard 0.47712125471966244 3 1 2.048802225347385
cool 0.47712125471966244 1 1 1.138223458526325
remembering 0.47712125471966244 3 1 2.048802225347385
laurence 0.47712125471966244 3 1 2.048802225347385
d3js 0.47712125471966244 1 1 1.138223458526325
made 0.47712125471966244 1 1 1.138223458526325
build 0.47712125471966244 2 1 1.593512841936855
document 0.0 2 1 1.593512841936855 3 1 2.048802225347385 1 2 1.138223458526325
originality 0.47712125471966244 3 1 2.048802225347385
bostock 0.47712125471966244 1 1 1.138223458526325
forgetting 0.47712125471966244 3 1 2.048802225347385
hear 0.47712125471966244 3 1 2.048802225347385
art 0.47712125471966244 3 1 2.048802225347385
human 0.47712125471966244 2 1 1.593512841936855
fine 0.47712125471966244 3 1 2.048802225347385
vonnegut 0.47712125471966244 2 1 1.593512841936855
art 0.47712125471966244 3 1 2.048802225347385
cool 0.47712125471966244 1 1 1.138223458526325
flaw 0.47712125471966244 2 1 1.593512841936855
human 0.47712125471966244 2 1 1.593512841936855
maintenance 0.47712125471966244 2 1 1.593512841936855
remembering 0.47712125471966244 3 1 2.048802225347385
bostock 0.47712125471966244 1 1 1.138223458526325
d3js 0.47712125471966244 1 1 1.138223458526325
forgetting 0.47712125471966244 3 1 2.048802225347385
kurt 0.47712125471966244 2 1 1.593512841936855
mike 0.47712125471966244 1 1 1.138223458526325
vonnegut 0.47712125471966244 2 1 1.593512841936855
build 0.47712125471966244 2 1 1.593512841936855
document 0.0 1 2 1.138223458526325 2 1 1.593512841936855 3 1 2.048802225347385
hear 0.47712125471966244 3 1 2.048802225347385
laurence 0.47712125471966244 3 1 2.048802225347385
originality 0.47712125471966244 3 1 2.048802225347385
character 0.47712125471966244 2 1 1.593512841936855
fine 0.47712125471966244 3 1 2.048802225347385
heard 0.47712125471966244 3 1 2.048802225347385
made 0.47712125471966244 1 1 1.138223458526325
peter 0.47712125471966244 3 1 2.048802225347385
#!/usr/bin/env python3
"""Map 0: emit one record per input document."""
import sys

# Every input line is one document. Emit the same key for each so that a
# single reducer sees every record and can count the total number of documents.
for _ in sys.stdin:
    print("common_key 1")
#!/usr/bin/env python3
"""Map 1: emit (term, doc_id, term frequency) for every term in every document."""
import re
import sys


def build_stop_words():
    """Load stopwords.txt into a dict for fast lookup."""
    stop_words_dict = {}
    with open("stopwords.txt", 'r') as fin:
        for word in fin:
            stop_words_dict[word.strip()] = True
    return stop_words_dict


def num_occurrences_each_term(doc_body, stop_words_dict):
    """Compute the number of occurrences of each term in a document."""
    term_freq = {}
    for word in doc_body.split():
        # Keep only alphanumeric characters and fold case
        word = re.sub(r'[^a-zA-Z0-9]+', '', word).lower()
        if word not in stop_words_dict and word != "":
            term_freq[word] = term_freq.get(word, 0) + 1
    return term_freq


stop_words_dict = build_stop_words()
local_freqs_by_doc_id = {}

for line in sys.stdin:  # for each document
    # Each line is: "doc_id","doc_name","doc_body"; the body may itself
    # contain commas, hence maxsplit=2
    fields = line.split(",", 2)
    doc_id = fields[0].strip("\"")
    doc_name = fields[1].strip("\"")
    doc_body = fields[2].strip("\"\n")
    # Index the title along with the body
    doc_body = doc_name + " " + doc_body
    local_freqs_by_doc_id[doc_id] = num_occurrences_each_term(doc_body, stop_words_dict)

# Output format: term \t doc_id \t tf (occurrences of term in that document)
for doc_id, local_term_freq in local_freqs_by_doc_id.items():
    for term, tf_k in local_term_freq.items():
        print(term + '\t' + doc_id + '\t' + str(tf_k))
#!/usr/bin/env python3
"""Map 2: attach the inverse document frequency (idf) to each record."""
import sys
import math

# reduce0 wrote the total number of documents to this file
with open("total_document_count.txt") as fin:
    N = float(next(fin))

idfs = {}
for line in sys.stdin:
    # Input: term \t doc_id \t tf \t n_k (number of documents containing term)
    term, doc_id, tf, n_k = line.split()
    if term not in idfs:
        idfs[term] = math.log10(N / float(n_k))
    idf = idfs[term]
    # Output, keyed by doc_id so reduce2 can sum per-document norm factors:
    # doc_id \t idf \t term \t tf \t n_k
    print('\t'.join([doc_id, str(idf), term, tf, n_k]))
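
# Worked check against the sample data above: N = 3 documents, and a term such
# as "mike" appears in n_k = 1 of them, so
#   idf = log10(3 / 1) = 0.47712125471966244
# which matches the weight recorded in the sample inverted index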
#!/usr/bin/env python3
"""Map 3: key each record by term so reduce3 can group the final index lines."""
import sys

for line in sys.stdin:
    # Input: doc_id \t idf \t term \t norm_factor_sq \t tf \t n_k
    # (the trailing n_k is no longer needed and is dropped here)
    fields = line.strip('\n').split()
    doc_id, idf, term, norm_factor_sq, num_occur_in_doc = fields[:5]
    # Output: term \t idf \t doc_id \t norm_factor_sq \t tf
    print('\t'.join([term, idf, doc_id, norm_factor_sq, num_occur_in_doc]))
#!/bin/bash
#
# Example of how to chain mapreduce jobs together. The output of one
# job is the input to the next.
#
# Hadoop streaming options
#   jar hadoop-streaming-2.7.2.jar   # Hadoop streaming jar
#   -input <directory>               # Input directory
#   -output <directory>              # Output directory
#   -mapper <exec_name>              # Mapper executable
#   -reducer <exec_name>             # Reducer executable
# Stop on errors
# See https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuo pipefail
set -x
# Remove first output directory, if it exists
rm -rf output1
# Run first MapReduce job
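# map0/reduce0 count the input documents; reduce0.py writes the total to
# total_document_count.txt as a side effect for map2.py to read later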
hadoop \
    jar hadoop-streaming-2.7.2.jar \
    -input sample_input \
    -output output1 \
    -mapper ./map0.py \
    -reducer ./reduce0.py
# Remove second output directory, if it exists
rm -rf output2
# Run second MapReduce job
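# map1/reduce1 read the raw documents again (not output1) and emit, for each
# (term, document) pair, the term frequency tf and the document count n_k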
hadoop \
    jar hadoop-streaming-2.7.2.jar \
    -input sample_input \
    -output output2 \
    -mapper ./map1.py \
    -reducer ./reduce1.py
# Remove third output directory, if it exists
rm -rf output3
# Run third MapReduce job
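# map2/reduce2 convert n_k to idf = log10(N / n_k) and accumulate each
# document's normalization factor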
hadoop \
    jar hadoop-streaming-2.7.2.jar \
    -input output2 \
    -output output3 \
    -mapper ./map2.py \
    -reducer ./reduce2.py
# Remove fourth output directory, if it exists
rm -rf output4
# Run fourth MapReduce job
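# map3/reduce3 re-key the records by term and merge each term's document
# postings into one inverted index line per term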
hadoop \
    jar hadoop-streaming-2.7.2.jar \
    -input output3 \
    -output output4 \
    -mapper ./map3.py \
    -reducer ./reduce3.py
# Concatenate the final reducer output into the inverted index file
cat output4/part* > inverted_index.txt
#!/usr/bin/env python3
"""Reduce 0: count the total number of documents."""
import sys

document_count = 0
for _ in sys.stdin:
    document_count += 1

# Write the count to a side-effect file rather than stdout, because map2.py
# needs the total while processing a different job's output
with open("total_document_count.txt", "w") as count_doc:
    count_doc.write(str(document_count) + '\n')
#!/usr/bin/env python3
"""Reduce 1: count n_k, the number of documents each term occurs in."""
import sys

terms = []
n_k = {}
for line in sys.stdin:
    # Input: term \t doc_id \t tf. map1 emits one line per (term, document)
    # pair, so counting lines per term gives the document frequency n_k
    term, doc_id, tf = line.strip("\n").split('\t')
    n_k[term] = n_k.get(term, 0) + 1
    terms.append({"term": term, "doc_id": doc_id, "tf": tf})

# Output: term \t doc_id \t tf \t n_k
for t in terms:
    print("\t".join([t["term"], t["doc_id"], t["tf"], str(n_k[t["term"]])]))
#!/usr/bin/env python3
"""Reduce 2: accumulate the normalization factor for each document."""
import sys

norm_factor_sums = {}
fields_to_output = []
for line in sys.stdin:
    # Input: doc_id \t idf \t term \t tf \t n_k
    fields = line.split()
    fields_to_output.append(fields)
    doc_id = fields[0]
    idf = float(fields[1])
    tf = float(fields[3])
    # A document's normalization factor is the sum over its terms of (tf * idf)^2
    if doc_id not in norm_factor_sums:
        norm_factor_sums[doc_id] = 0
    norm_factor_sums[doc_id] += (tf ** 2) * (idf ** 2)

# Insert the finished per-document sum into every record for that document.
# Output: doc_id \t idf \t term \t norm_factor_sq \t tf \t n_k
for fields in fields_to_output:
    fields.insert(3, str(norm_factor_sums[fields[0]]))
    print('\t'.join(fields))
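
# Worked check against the sample data above: document 1 has five indexed
# terms with nonzero idf (mike, bostock, d3js, made, cool), each with tf = 1,
# while "document" has idf = 0 and contributes nothing, so its norm factor is
#   5 * (1 * 0.47712125471966244)^2 = 1.138223458526325
# which matches the value stored with doc_id 1 in the sample index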
#!/usr/bin/env python3
"""Reduce 3: group records by term into the final inverted index lines."""
import sys

terms = {}
for line in sys.stdin:
    # Input: term \t idf \t doc_id \t norm_factor_sq \t tf
    term, idf, doc_id, norm_factor_sq, num_occur_in_doc = line.split()
    doc_entry = '\t' + doc_id + '\t' + num_occur_in_doc + '\t' + norm_factor_sq
    if term not in terms:
        terms[term] = term + '\t' + idf
    terms[term] += doc_entry

# Output one line per term:
# term idf doc_id tf norm_factor_sq [doc_id tf norm_factor_sq ...]
for out_str in terms.values():
    print(out_str)
"1","The Document: A","This document is about Mike Bostock. He made d3.js and he's really cool"
"2","The Document: B","Another flaw in the human character is that everybody wants to build and nobody wants to do maintenance. - Kurt Vonnegut"
"3","Document C:","Originality is the fine art of remembering what you hear but forgetting where you heard it. - Laurence Peter"