Boolean string to elasticsearch query OSS module

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;

up vote
6
down vote

favorite

I am putting together a little module for oss release that will let you parse a boolean expression consisting of and/AND/or/OR's (no brackets yet) and output a complete elasticsearch query.

Boolean expression logic:

Right now it uses OR as the basis and puts everything on top of that as ANDs. This means that AND binds left to right.

I lack input on:

The quality of the output elasticsearch query - can it be simplified? Are there better approaches?

The way I interpret the boolean expression.

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 
 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.extend([
 "match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"
 ])
 else:
 filters = [
 "bool": 
 "should": ["match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"]

 for term in and_terms]
 or_terms.append(
 "bool": 
 "must": filters
 )

 return "query":
 "bool": 
 "should": or_terms
 


query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "bool": 
 "should": [
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "cat "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "cat "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " cat food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " cat food"
 
 
 
 ]
 
 
 ]
 
 ,
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "dog "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "dog "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " dog food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " dog food"
 
 
 
 ]
 
 
 ]
 
 
 ]

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

Does this currently work as intended?
â€“Â Mast
Mar 21 at 15:42

1

Yeah i would say so.
â€“Â Johannes valbjÃ¸rn
Mar 21 at 16:42

3

Hmm, "please correct me on any stack overflow newbie errors" makes me seriously doubt your code works as you intend.
â€“Â Peilonrayz
Mar 21 at 17:01

1

It was a comment made for any errors mde in etiquette on this forum. I am positive my code works as intended. See the assertion in the code block where i show what the output is. Copy paste in a python interpreter and voila .
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:28

My question is 1. Whether the output elasticsearch query is overly complex and could be simplified? And 2. Whether the output reflected a correct and intuitive understanding of the boolean string input
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:32

add a commentÂ |Â

up vote
6
down vote

favorite

I am putting together a little module for oss release that will let you parse a boolean expression consisting of and/AND/or/OR's (no brackets yet) and output a complete elasticsearch query.

Boolean expression logic:

Right now it uses OR as the basis and puts everything on top of that as ANDs. This means that AND binds left to right.

I lack input on:

The quality of the output elasticsearch query - can it be simplified? Are there better approaches?

The way I interpret the boolean expression.

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 
 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.extend([
 "match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"
 ])
 else:
 filters = [
 "bool": 
 "should": ["match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"]

 for term in and_terms]
 or_terms.append(
 "bool": 
 "must": filters
 )

 return "query":
 "bool": 
 "should": or_terms
 


query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "bool": 
 "should": [
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "cat "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "cat "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " cat food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " cat food"
 
 
 
 ]
 
 
 ]
 
 ,
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "dog "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "dog "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " dog food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " dog food"
 
 
 
 ]
 
 
 ]
 
 
 ]

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

Does this currently work as intended?
â€“Â Mast
Mar 21 at 15:42

1

Yeah i would say so.
â€“Â Johannes valbjÃ¸rn
Mar 21 at 16:42

3

Hmm, "please correct me on any stack overflow newbie errors" makes me seriously doubt your code works as you intend.
â€“Â Peilonrayz
Mar 21 at 17:01

1

It was a comment made for any errors mde in etiquette on this forum. I am positive my code works as intended. See the assertion in the code block where i show what the output is. Copy paste in a python interpreter and voila .
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:28

My question is 1. Whether the output elasticsearch query is overly complex and could be simplified? And 2. Whether the output reflected a correct and intuitive understanding of the boolean string input
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:32

add a commentÂ |Â

up vote
6
down vote

favorite

I am putting together a little module for oss release that will let you parse a boolean expression consisting of and/AND/or/OR's (no brackets yet) and output a complete elasticsearch query.

Boolean expression logic:

Right now it uses OR as the basis and puts everything on top of that as ANDs. This means that AND binds left to right.

I lack input on:

The quality of the output elasticsearch query - can it be simplified? Are there better approaches?

The way I interpret the boolean expression.

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 
 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.extend([
 "match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"
 ])
 else:
 filters = [
 "bool": 
 "should": ["match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"]

 for term in and_terms]
 or_terms.append(
 "bool": 
 "must": filters
 )

 return "query":
 "bool": 
 "should": or_terms
 


query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "bool": 
 "should": [
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "cat "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "cat "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " cat food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " cat food"
 
 
 
 ]
 
 
 ]
 
 ,
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "dog "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "dog "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " dog food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " dog food"
 
 
 
 ]
 
 
 ]
 
 
 ]

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

I am putting together a little module for oss release that will let you parse a boolean expression consisting of and/AND/or/OR's (no brackets yet) and output a complete elasticsearch query.

Boolean expression logic:

Right now it uses OR as the basis and puts everything on top of that as ANDs. This means that AND binds left to right.

I lack input on:

The quality of the output elasticsearch query - can it be simplified? Are there better approaches?

The way I interpret the boolean expression.

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 
 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.extend([
 "match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"
 ])
 else:
 filters = [
 "bool": 
 "should": ["match": "Review.Text": 
 "query": term, "operator": "and",
 "match": "Review.Title": 
 "query": term, "operator": "and"]

 for term in and_terms]
 or_terms.append(
 "bool": 
 "must": filters
 )

 return "query":
 "bool": 
 "should": or_terms
 


query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "bool": 
 "should": [
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "cat "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "cat "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " cat food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " cat food"
 
 
 
 ]
 
 
 ]
 
 ,
 
 "bool": 
 "must": [
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": "dog "
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": "dog "
 
 
 
 ]
 
 ,
 
 "bool": 
 "should": [
 
 "match": 
 "Review.Text": 
 "operator": "and",
 "query": " dog food"
 
 
 ,
 
 "match": 
 "Review.Title": 
 "operator": "and",
 "query": " dog food"
 
 
 
 ]
 
 
 ]
 
 
 ]

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

edited Mar 22 at 8:39

Mathias Ettinger

21.9k32876

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

asked Mar 21 at 15:20

Johannes valbjÃ¸rn

435

Does this currently work as intended?
â€“Â Mast
Mar 21 at 15:42

1

Yeah i would say so.
â€“Â Johannes valbjÃ¸rn
Mar 21 at 16:42

3

Hmm, "please correct me on any stack overflow newbie errors" makes me seriously doubt your code works as you intend.
â€“Â Peilonrayz
Mar 21 at 17:01

1

It was a comment made for any errors mde in etiquette on this forum. I am positive my code works as intended. See the assertion in the code block where i show what the output is. Copy paste in a python interpreter and voila .
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:28

My question is 1. Whether the output elasticsearch query is overly complex and could be simplified? And 2. Whether the output reflected a correct and intuitive understanding of the boolean string input
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:32

add a commentÂ |Â

Does this currently work as intended?
â€“Â Mast
Mar 21 at 15:42

1

Yeah i would say so.
â€“Â Johannes valbjÃ¸rn
Mar 21 at 16:42

3

Hmm, "please correct me on any stack overflow newbie errors" makes me seriously doubt your code works as you intend.
â€“Â Peilonrayz
Mar 21 at 17:01

1

It was a comment made for any errors mde in etiquette on this forum. I am positive my code works as intended. See the assertion in the code block where i show what the output is. Copy paste in a python interpreter and voila .
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:28

My question is 1. Whether the output elasticsearch query is overly complex and could be simplified? And 2. Whether the output reflected a correct and intuitive understanding of the boolean string input
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:32

Does this currently work as intended?
â€“Â Mast
Mar 21 at 15:42

Yeah i would say so.
â€“Â Johannes valbjÃ¸rn
Mar 21 at 16:42

Hmm, "please correct me on any stack overflow newbie errors" makes me seriously doubt your code works as you intend.
â€“Â Peilonrayz
Mar 21 at 17:01

It was a comment made for any errors mde in etiquette on this forum. I am positive my code works as intended. See the assertion in the code block where i show what the output is. Copy paste in a python interpreter and voila .
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:28

My question is 1. Whether the output elasticsearch query is overly complex and could be simplified? And 2. Whether the output reflected a correct and intuitive understanding of the boolean string input
â€“Â Johannes valbjÃ¸rn
Mar 21 at 17:32

add a commentÂ |Â

2 Answers
2

active

oldest

votes

up vote
4
down vote

accepted

Your usage of split makes your function rather fragile:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': '',
 'match': 'Review.Title': 'operator': 'and',
 'query': ''],
 'bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': ' '
 'heart',
 'match': 'Review.Title': 'operator': 'and',
 'query': ' '
 'heart']],
 'match': 'Review.Text': 'operator': 'and',
 'query': 'doct',
 'match': 'Review.Title': 'operator': 'and',
 'query': 'doct']

Which is equivalent to: "(the empty string AND heart) OR doct" rather than "doctor AND heart".

An other use-case to consider is the use of "and" or "or" as words to search for rather than operators (as "Tom and Jerry", I don't want to search for documents containing "Tom" and "Jerry" separately, but for documents containing the phrase "Tom and Jerry").

Usually, for these kind of problems, an intermediate representation produced by an ad-hoc parser is way better and simpler to convert to the end result. Here I suggest producing a list of lists, since you don't (yet) consider priorisation of clauses using parenthesis. Thus:

[
 [A, B, C],
 [D, E],
 [F],
]

Would be equivalent to "(A and B and C) or (D and E) or F". Which can then easily be converted to the elasticsearch query DSL using simple list comprehensions. The catch, however is that each clause can be complete sentences and must apply to two fields: "Review.Text" and "Review.Title". This is where the multi-match query can simplify the whole writing: each clause A, B, C, D, E, and F would be converted to

'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],

With all the advantages of the multi-match query such as giving more weight to a single field.

The following rewrite extend the supported syntax to allow double quotes to mean "perfect match":

import re


class ClauseParser:
 def __init__(self, tokenizer, *operators):
 self._tokenizer = tokenizer
 self._operators = set(operators)
 self._found_operator = None

 def __iter__(self):
 for token in self._tokenizer:
 token_value = token.group(0)
 if token.group(2) in self._operators:
 self._found_operator = token_value
 return
 yield token_value

 @property
 def operator(self):
 found_operator = self._found_operator
 self._found_operator = None
 return found_operator


def parser(tokenizer):
 clause_parser = ClauseParser(tokenizer, 'and', 'or')
 current_group = 
 while True:
 current_group.append(' '.join(clause_parser))
 found_operator = clause_parser.operator
 if found_operator != 'and':
 yield current_group
 if found_operator is None:
 return
 current_group = 


def convert_and_clauses(clauses):
 return [
 'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],
 for clause in clauses
 ]


def string_to_query(phrase):
 tokenizer = re.finditer(r'"([^"]+)"|(w+)', phrase)
 query = list(parser(tokenizer))

 or_clauses = 'bool': 'should': [
 'bool': 'must': convert_and_clauses(clauses)
 for clauses in query
 ]

 return 'query': or_clauses

Example usage:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'doctor',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'heart',
 'type': 'phrase']]
>>> string_to_query('"Tom and Jerry" or "Road runner and vil coyote"')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Tom '
 'and '
 'Jerry"',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Road '
 'runner '
 'and '
 'vil '
 'coyote"',
 'type': 'phrase']]
>>> string_to_query('cat and cat food or dog and dog food')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat '
 'food',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog '
 'food',
 'type': 'phrase']]

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

This is exactly what we needed!!!! Your solution is much appreciated! Would you be ok with releasing this code in a generalised way (fields should be given as args) under a MIT license to pypi ? I could take care of packaging, readme, tests and upload to pypi / github and list you as author
â€“Â Johannes valbjÃ¸rn
Mar 22 at 10:47

@JohannesvalbjÃ¸rn Sure. What IÃ¢Â€Â™ve made in the past is to create a pull-request so my name is associated exactly to the work provided and not necessarily to the whole project.
â€“Â Mathias Ettinger
Mar 22 at 10:55

good idea!!! i've created a public repo, youre more than welcome to create a PR to it : github.com/trustpilot/python-stringtoesquery
â€“Â Johannes valbjÃ¸rn
Mar 22 at 11:03

add a commentÂ |Â

up vote
1
down vote

I have come up with another solution that yields much simpler results.
It uses query_string searching and the builtin boolean expressions over a set of defined fields:

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 

 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.append('""'.format(term.strip()))
 else:
 and_terms = ['""'.format(term.strip()) for term in and_terms]
 and_string = "( " + " AND ".join(and_terms) + " )"
 or_terms.append(and_string)

 query_string = " OR ".join(or_terms)
 return 
 "query": 
 "query_string": 
 "fields": ["Review.Title", "Review.Text"],
 "query": query_string
 
 
 

query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "query_string": 
 "fields": [
 "Review.Title",
 "Review.Text"
 ],
 "query": "( "cat" AND "cat food" ) OR ( "dog" AND "dog food" )"

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

add a commentÂ |Â

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f190128%2fboolean-string-to-elasticsearch-query-oss-module%23new-answer', 'question_page');

);

Post as a guest

Name

2 Answers
2

active

oldest

votes

2 Answers
2

active

oldest

votes

up vote
4
down vote

accepted

Your usage of split makes your function rather fragile:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': '',
 'match': 'Review.Title': 'operator': 'and',
 'query': ''],
 'bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': ' '
 'heart',
 'match': 'Review.Title': 'operator': 'and',
 'query': ' '
 'heart']],
 'match': 'Review.Text': 'operator': 'and',
 'query': 'doct',
 'match': 'Review.Title': 'operator': 'and',
 'query': 'doct']

Which is equivalent to: "(the empty string AND heart) OR doct" rather than "doctor AND heart".

[
 [A, B, C],
 [D, E],
 [F],
]

'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],

With all the advantages of the multi-match query such as giving more weight to a single field.

The following rewrite extend the supported syntax to allow double quotes to mean "perfect match":

import re


class ClauseParser:
 def __init__(self, tokenizer, *operators):
 self._tokenizer = tokenizer
 self._operators = set(operators)
 self._found_operator = None

 def __iter__(self):
 for token in self._tokenizer:
 token_value = token.group(0)
 if token.group(2) in self._operators:
 self._found_operator = token_value
 return
 yield token_value

 @property
 def operator(self):
 found_operator = self._found_operator
 self._found_operator = None
 return found_operator


def parser(tokenizer):
 clause_parser = ClauseParser(tokenizer, 'and', 'or')
 current_group = 
 while True:
 current_group.append(' '.join(clause_parser))
 found_operator = clause_parser.operator
 if found_operator != 'and':
 yield current_group
 if found_operator is None:
 return
 current_group = 


def convert_and_clauses(clauses):
 return [
 'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],
 for clause in clauses
 ]


def string_to_query(phrase):
 tokenizer = re.finditer(r'"([^"]+)"|(w+)', phrase)
 query = list(parser(tokenizer))

 or_clauses = 'bool': 'should': [
 'bool': 'must': convert_and_clauses(clauses)
 for clauses in query
 ]

 return 'query': or_clauses

Example usage:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'doctor',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'heart',
 'type': 'phrase']]
>>> string_to_query('"Tom and Jerry" or "Road runner and vil coyote"')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Tom '
 'and '
 'Jerry"',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Road '
 'runner '
 'and '
 'vil '
 'coyote"',
 'type': 'phrase']]
>>> string_to_query('cat and cat food or dog and dog food')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat '
 'food',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog '
 'food',
 'type': 'phrase']]

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

This is exactly what we needed!!!! Your solution is much appreciated! Would you be ok with releasing this code in a generalised way (fields should be given as args) under a MIT license to pypi ? I could take care of packaging, readme, tests and upload to pypi / github and list you as author
â€“Â Johannes valbjÃ¸rn
Mar 22 at 10:47

@JohannesvalbjÃ¸rn Sure. What IÃ¢Â€Â™ve made in the past is to create a pull-request so my name is associated exactly to the work provided and not necessarily to the whole project.
â€“Â Mathias Ettinger
Mar 22 at 10:55

good idea!!! i've created a public repo, youre more than welcome to create a PR to it : github.com/trustpilot/python-stringtoesquery
â€“Â Johannes valbjÃ¸rn
Mar 22 at 11:03

add a commentÂ |Â

up vote
4
down vote

accepted

Your usage of split makes your function rather fragile:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': '',
 'match': 'Review.Title': 'operator': 'and',
 'query': ''],
 'bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': ' '
 'heart',
 'match': 'Review.Title': 'operator': 'and',
 'query': ' '
 'heart']],
 'match': 'Review.Text': 'operator': 'and',
 'query': 'doct',
 'match': 'Review.Title': 'operator': 'and',
 'query': 'doct']

Which is equivalent to: "(the empty string AND heart) OR doct" rather than "doctor AND heart".

[
 [A, B, C],
 [D, E],
 [F],
]

'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],

With all the advantages of the multi-match query such as giving more weight to a single field.

The following rewrite extend the supported syntax to allow double quotes to mean "perfect match":

import re


class ClauseParser:
 def __init__(self, tokenizer, *operators):
 self._tokenizer = tokenizer
 self._operators = set(operators)
 self._found_operator = None

 def __iter__(self):
 for token in self._tokenizer:
 token_value = token.group(0)
 if token.group(2) in self._operators:
 self._found_operator = token_value
 return
 yield token_value

 @property
 def operator(self):
 found_operator = self._found_operator
 self._found_operator = None
 return found_operator


def parser(tokenizer):
 clause_parser = ClauseParser(tokenizer, 'and', 'or')
 current_group = 
 while True:
 current_group.append(' '.join(clause_parser))
 found_operator = clause_parser.operator
 if found_operator != 'and':
 yield current_group
 if found_operator is None:
 return
 current_group = 


def convert_and_clauses(clauses):
 return [
 'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],
 for clause in clauses
 ]


def string_to_query(phrase):
 tokenizer = re.finditer(r'"([^"]+)"|(w+)', phrase)
 query = list(parser(tokenizer))

 or_clauses = 'bool': 'should': [
 'bool': 'must': convert_and_clauses(clauses)
 for clauses in query
 ]

 return 'query': or_clauses

Example usage:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'doctor',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'heart',
 'type': 'phrase']]
>>> string_to_query('"Tom and Jerry" or "Road runner and vil coyote"')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Tom '
 'and '
 'Jerry"',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Road '
 'runner '
 'and '
 'vil '
 'coyote"',
 'type': 'phrase']]
>>> string_to_query('cat and cat food or dog and dog food')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat '
 'food',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog '
 'food',
 'type': 'phrase']]

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

This is exactly what we needed!!!! Your solution is much appreciated! Would you be ok with releasing this code in a generalised way (fields should be given as args) under a MIT license to pypi ? I could take care of packaging, readme, tests and upload to pypi / github and list you as author
â€“Â Johannes valbjÃ¸rn
Mar 22 at 10:47

@JohannesvalbjÃ¸rn Sure. What IÃ¢Â€Â™ve made in the past is to create a pull-request so my name is associated exactly to the work provided and not necessarily to the whole project.
â€“Â Mathias Ettinger
Mar 22 at 10:55

good idea!!! i've created a public repo, youre more than welcome to create a PR to it : github.com/trustpilot/python-stringtoesquery
â€“Â Johannes valbjÃ¸rn
Mar 22 at 11:03

add a commentÂ |Â

up vote
4
down vote

accepted

Your usage of split makes your function rather fragile:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': '',
 'match': 'Review.Title': 'operator': 'and',
 'query': ''],
 'bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': ' '
 'heart',
 'match': 'Review.Title': 'operator': 'and',
 'query': ' '
 'heart']],
 'match': 'Review.Text': 'operator': 'and',
 'query': 'doct',
 'match': 'Review.Title': 'operator': 'and',
 'query': 'doct']

Which is equivalent to: "(the empty string AND heart) OR doct" rather than "doctor AND heart".

[
 [A, B, C],
 [D, E],
 [F],
]

'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],

With all the advantages of the multi-match query such as giving more weight to a single field.

The following rewrite extend the supported syntax to allow double quotes to mean "perfect match":

import re


class ClauseParser:
 def __init__(self, tokenizer, *operators):
 self._tokenizer = tokenizer
 self._operators = set(operators)
 self._found_operator = None

 def __iter__(self):
 for token in self._tokenizer:
 token_value = token.group(0)
 if token.group(2) in self._operators:
 self._found_operator = token_value
 return
 yield token_value

 @property
 def operator(self):
 found_operator = self._found_operator
 self._found_operator = None
 return found_operator


def parser(tokenizer):
 clause_parser = ClauseParser(tokenizer, 'and', 'or')
 current_group = 
 while True:
 current_group.append(' '.join(clause_parser))
 found_operator = clause_parser.operator
 if found_operator != 'and':
 yield current_group
 if found_operator is None:
 return
 current_group = 


def convert_and_clauses(clauses):
 return [
 'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],
 for clause in clauses
 ]


def string_to_query(phrase):
 tokenizer = re.finditer(r'"([^"]+)"|(w+)', phrase)
 query = list(parser(tokenizer))

 or_clauses = 'bool': 'should': [
 'bool': 'must': convert_and_clauses(clauses)
 for clauses in query
 ]

 return 'query': or_clauses

Example usage:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'doctor',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'heart',
 'type': 'phrase']]
>>> string_to_query('"Tom and Jerry" or "Road runner and vil coyote"')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Tom '
 'and '
 'Jerry"',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Road '
 'runner '
 'and '
 'vil '
 'coyote"',
 'type': 'phrase']]
>>> string_to_query('cat and cat food or dog and dog food')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat '
 'food',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog '
 'food',
 'type': 'phrase']]

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

Your usage of split makes your function rather fragile:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': '',
 'match': 'Review.Title': 'operator': 'and',
 'query': ''],
 'bool': 'should': ['match': 'Review.Text': 'operator': 'and',
 'query': ' '
 'heart',
 'match': 'Review.Title': 'operator': 'and',
 'query': ' '
 'heart']],
 'match': 'Review.Text': 'operator': 'and',
 'query': 'doct',
 'match': 'Review.Title': 'operator': 'and',
 'query': 'doct']

Which is equivalent to: "(the empty string AND heart) OR doct" rather than "doctor AND heart".

[
 [A, B, C],
 [D, E],
 [F],
]

'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],

With all the advantages of the multi-match query such as giving more weight to a single field.

The following rewrite extend the supported syntax to allow double quotes to mean "perfect match":

import re


class ClauseParser:
 def __init__(self, tokenizer, *operators):
 self._tokenizer = tokenizer
 self._operators = set(operators)
 self._found_operator = None

 def __iter__(self):
 for token in self._tokenizer:
 token_value = token.group(0)
 if token.group(2) in self._operators:
 self._found_operator = token_value
 return
 yield token_value

 @property
 def operator(self):
 found_operator = self._found_operator
 self._found_operator = None
 return found_operator


def parser(tokenizer):
 clause_parser = ClauseParser(tokenizer, 'and', 'or')
 current_group = 
 while True:
 current_group.append(' '.join(clause_parser))
 found_operator = clause_parser.operator
 if found_operator != 'and':
 yield current_group
 if found_operator is None:
 return
 current_group = 


def convert_and_clauses(clauses):
 return [
 'multi_match': 
 'query': clause,
 'type': 'phrase',
 'fields': ['Review.Text', 'Review.Title'],
 for clause in clauses
 ]


def string_to_query(phrase):
 tokenizer = re.finditer(r'"([^"]+)"|(w+)', phrase)
 query = list(parser(tokenizer))

 or_clauses = 'bool': 'should': [
 'bool': 'must': convert_and_clauses(clauses)
 for clauses in query
 ]

 return 'query': or_clauses

Example usage:

>>> string_to_query('doctor and heart')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'doctor',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'heart',
 'type': 'phrase']]
>>> string_to_query('"Tom and Jerry" or "Road runner and vil coyote"')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Tom '
 'and '
 'Jerry"',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': '"Road '
 'runner '
 'and '
 'vil '
 'coyote"',
 'type': 'phrase']]
>>> string_to_query('cat and cat food or dog and dog food')
'query': 'bool': 'should': ['bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'cat '
 'food',
 'type': 'phrase'],
 'bool': 'must': ['multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog',
 'type': 'phrase',
 'multi_match': 'fields': ['Review.Text',
 'Review.Title'],
 'query': 'dog '
 'food',
 'type': 'phrase']]

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

answered Mar 22 at 10:01

Mathias Ettinger

21.9k32876

This is exactly what we needed!!!! Your solution is much appreciated! Would you be ok with releasing this code in a generalised way (fields should be given as args) under a MIT license to pypi ? I could take care of packaging, readme, tests and upload to pypi / github and list you as author
â€“Â Johannes valbjÃ¸rn
Mar 22 at 10:47

@JohannesvalbjÃ¸rn Sure. What IÃ¢Â€Â™ve made in the past is to create a pull-request so my name is associated exactly to the work provided and not necessarily to the whole project.
â€“Â Mathias Ettinger
Mar 22 at 10:55

good idea!!! i've created a public repo, youre more than welcome to create a PR to it : github.com/trustpilot/python-stringtoesquery
â€“Â Johannes valbjÃ¸rn
Mar 22 at 11:03

add a commentÂ |Â

This is exactly what we needed!!!! Your solution is much appreciated! Would you be ok with releasing this code in a generalised way (fields should be given as args) under a MIT license to pypi ? I could take care of packaging, readme, tests and upload to pypi / github and list you as author
â€“Â Johannes valbjÃ¸rn
Mar 22 at 10:47

@JohannesvalbjÃ¸rn Sure. What IÃ¢Â€Â™ve made in the past is to create a pull-request so my name is associated exactly to the work provided and not necessarily to the whole project.
â€“Â Mathias Ettinger
Mar 22 at 10:55

good idea!!! i've created a public repo, youre more than welcome to create a PR to it : github.com/trustpilot/python-stringtoesquery
â€“Â Johannes valbjÃ¸rn
Mar 22 at 11:03

This is exactly what we needed!!!! Your solution is much appreciated! Would you be ok with releasing this code in a generalised way (fields should be given as args) under a MIT license to pypi ? I could take care of packaging, readme, tests and upload to pypi / github and list you as author
â€“Â Johannes valbjÃ¸rn
Mar 22 at 10:47

@JohannesvalbjÃ¸rn Sure. What IÃ¢Â€Â™ve made in the past is to create a pull-request so my name is associated exactly to the work provided and not necessarily to the whole project.
â€“Â Mathias Ettinger
Mar 22 at 10:55

good idea!!! i've created a public repo, youre more than welcome to create a PR to it : github.com/trustpilot/python-stringtoesquery
â€“Â Johannes valbjÃ¸rn
Mar 22 at 11:03

add a commentÂ |Â

up vote
1
down vote

I have come up with another solution that yields much simpler results.
It uses query_string searching and the builtin boolean expressions over a set of defined fields:

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 

 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.append('""'.format(term.strip()))
 else:
 and_terms = ['""'.format(term.strip()) for term in and_terms]
 and_string = "( " + " AND ".join(and_terms) + " )"
 or_terms.append(and_string)

 query_string = " OR ".join(or_terms)
 return 
 "query": 
 "query_string": 
 "fields": ["Review.Title", "Review.Text"],
 "query": query_string
 
 
 

query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "query_string": 
 "fields": [
 "Review.Title",
 "Review.Text"
 ],
 "query": "( "cat" AND "cat food" ) OR ( "dog" AND "dog food" )"

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

add a commentÂ |Â

up vote
1
down vote

I have come up with another solution that yields much simpler results.
It uses query_string searching and the builtin boolean expressions over a set of defined fields:

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 

 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.append('""'.format(term.strip()))
 else:
 and_terms = ['""'.format(term.strip()) for term in and_terms]
 and_string = "( " + " AND ".join(and_terms) + " )"
 or_terms.append(and_string)

 query_string = " OR ".join(or_terms)
 return 
 "query": 
 "query_string": 
 "fields": ["Review.Title", "Review.Text"],
 "query": query_string
 
 
 

query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "query_string": 
 "fields": [
 "Review.Title",
 "Review.Text"
 ],
 "query": "( "cat" AND "cat food" ) OR ( "dog" AND "dog food" )"

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

add a commentÂ |Â

up vote
1
down vote

I have come up with another solution that yields much simpler results.
It uses query_string searching and the builtin boolean expressions over a set of defined fields:

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 

 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.append('""'.format(term.strip()))
 else:
 and_terms = ['""'.format(term.strip()) for term in and_terms]
 and_string = "( " + " AND ".join(and_terms) + " )"
 or_terms.append(and_string)

 query_string = " OR ".join(or_terms)
 return 
 "query": 
 "query_string": 
 "fields": ["Review.Title", "Review.Text"],
 "query": query_string
 
 
 

query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "query_string": 
 "fields": [
 "Review.Title",
 "Review.Text"
 ],
 "query": "( "cat" AND "cat food" ) OR ( "dog" AND "dog food" )"

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

I have come up with another solution that yields much simpler results.
It uses query_string searching and the builtin boolean expressions over a set of defined fields:

def string_to_query(s):
 s = s.lower()
 tokens = [' '.join(t.split()) for t in s.split('or')]
 or_terms = 

 while tokens:
 leaf = tokens.pop()

 and_terms = leaf.split('and')
 if len(and_terms) < 2:
 term = and_terms[0]
 or_terms.append('""'.format(term.strip()))
 else:
 and_terms = ['""'.format(term.strip()) for term in and_terms]
 and_string = "( " + " AND ".join(and_terms) + " )"
 or_terms.append(and_string)

 query_string = " OR ".join(or_terms)
 return 
 "query": 
 "query_string": 
 "fields": ["Review.Title", "Review.Text"],
 "query": query_string
 
 
 

query = string_to_query(
 'dog and dog food or cat and cat food'
)

assert query == 
"query": 
 "query_string": 
 "fields": [
 "Review.Title",
 "Review.Text"
 ],
 "query": "( "cat" AND "cat food" ) OR ( "dog" AND "dog food" )"

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

answered Mar 22 at 7:29

Johannes valbjÃ¸rn

435

add a commentÂ |Â

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr