Scan files and pick overlapped hex codes

Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
4
down vote
favorite
Problem
Two folders having files with nameEfile that store lines starting with hex codes.
Print those hex codes that overlap in both folders.
Solution
import os
import sys
import re
def process_cache(cache):
event_code_set = set()
for file_name, multi_line_content in cache.items():
if file_name.endswith('Efile'):
for line in multi_line_content.splitlines():
line = line.rstrip('\') # trailing backslash, if exist
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
return event_code_set
def scan_files(dir):
cache =
for root, dirs, files in os.walk(dir):
for name in files:
if name in ('Efile'):
path = os.path.join(root, name)
with open(path,'r') as file:
cache[path] = file.read()
return cache
cache1 = scan_files(sys.argv[1])
cache2 = scan_files(sys.argv[2])
cache1_event_code_set = process_cache(cache1)
cache2_event_code_set = process_cache(cache2)
overlap_event_codes = cache1_event_code_set & cache2_event_code_set
print(overlap_event_codes)
For a typical three entries in a file, with comments(#),
0x00010d35 D 11 G 3,0x10009,N R XY.Condition, "(a 0x40001 == H 0x166) && (a 0x11ff8 == I 15)","0x0763ffc2 "
# Below event codes are from vendor xyz
0x84900c5 M 22 Y 1,0x03330469,4,5,6,7,8
0x04b60ff6 L 50 U
0x04c60e07,102 && (a 0x11ff8 == I 15)","0x0763ffc2 "
Picking 0x00010d35, 0x04b60ff6 & 0x84900c5 is the task. Rest of the line is supposed to be ignored
Some entries are multi-line with trailing backslash.
Each file is in megabytes. Total files in both folders - 80
1) Please suggest optimization in the below code, because pattern check is done twice.
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
2) Please suggest coding style optimizations for tree walk, cache and command line arguments.
python regex
add a comment |Â
up vote
4
down vote
favorite
Problem
Two folders having files with nameEfile that store lines starting with hex codes.
Print those hex codes that overlap in both folders.
Solution
import os
import sys
import re
def process_cache(cache):
event_code_set = set()
for file_name, multi_line_content in cache.items():
if file_name.endswith('Efile'):
for line in multi_line_content.splitlines():
line = line.rstrip('\') # trailing backslash, if exist
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
return event_code_set
def scan_files(dir):
cache =
for root, dirs, files in os.walk(dir):
for name in files:
if name in ('Efile'):
path = os.path.join(root, name)
with open(path,'r') as file:
cache[path] = file.read()
return cache
cache1 = scan_files(sys.argv[1])
cache2 = scan_files(sys.argv[2])
cache1_event_code_set = process_cache(cache1)
cache2_event_code_set = process_cache(cache2)
overlap_event_codes = cache1_event_code_set & cache2_event_code_set
print(overlap_event_codes)
For a typical three entries in a file, with comments(#),
0x00010d35 D 11 G 3,0x10009,N R XY.Condition, "(a 0x40001 == H 0x166) && (a 0x11ff8 == I 15)","0x0763ffc2 "
# Below event codes are from vendor xyz
0x84900c5 M 22 Y 1,0x03330469,4,5,6,7,8
0x04b60ff6 L 50 U
0x04c60e07,102 && (a 0x11ff8 == I 15)","0x0763ffc2 "
Picking 0x00010d35, 0x04b60ff6 & 0x84900c5 is the task. Rest of the line is supposed to be ignored
Some entries are multi-line with trailing backslash.
Each file is in megabytes. Total files in both folders - 80
1) Please suggest optimization in the below code, because pattern check is done twice.
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
2) Please suggest coding style optimizations for tree walk, cache and command line arguments.
python regex
Returningcacheis a bug and wrong practice, despite it works...cache1should not point to localcache
â overexchange
Jul 21 at 12:48
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers.
â Mast
Jul 21 at 13:46
add a comment |Â
up vote
4
down vote
favorite
up vote
4
down vote
favorite
Problem
Two folders having files with nameEfile that store lines starting with hex codes.
Print those hex codes that overlap in both folders.
Solution
import os
import sys
import re
def process_cache(cache):
event_code_set = set()
for file_name, multi_line_content in cache.items():
if file_name.endswith('Efile'):
for line in multi_line_content.splitlines():
line = line.rstrip('\') # trailing backslash, if exist
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
return event_code_set
def scan_files(dir):
cache =
for root, dirs, files in os.walk(dir):
for name in files:
if name in ('Efile'):
path = os.path.join(root, name)
with open(path,'r') as file:
cache[path] = file.read()
return cache
cache1 = scan_files(sys.argv[1])
cache2 = scan_files(sys.argv[2])
cache1_event_code_set = process_cache(cache1)
cache2_event_code_set = process_cache(cache2)
overlap_event_codes = cache1_event_code_set & cache2_event_code_set
print(overlap_event_codes)
For a typical three entries in a file, with comments(#),
0x00010d35 D 11 G 3,0x10009,N R XY.Condition, "(a 0x40001 == H 0x166) && (a 0x11ff8 == I 15)","0x0763ffc2 "
# Below event codes are from vendor xyz
0x84900c5 M 22 Y 1,0x03330469,4,5,6,7,8
0x04b60ff6 L 50 U
0x04c60e07,102 && (a 0x11ff8 == I 15)","0x0763ffc2 "
Picking 0x00010d35, 0x04b60ff6 & 0x84900c5 is the task. Rest of the line is supposed to be ignored
Some entries are multi-line with trailing backslash.
Each file is in megabytes. Total files in both folders - 80
1) Please suggest optimization in the below code, because pattern check is done twice.
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
2) Please suggest coding style optimizations for tree walk, cache and command line arguments.
python regex
Problem
Two folders having files with nameEfile that store lines starting with hex codes.
Print those hex codes that overlap in both folders.
Solution
import os
import sys
import re
def process_cache(cache):
event_code_set = set()
for file_name, multi_line_content in cache.items():
if file_name.endswith('Efile'):
for line in multi_line_content.splitlines():
line = line.rstrip('\') # trailing backslash, if exist
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
return event_code_set
def scan_files(dir):
cache =
for root, dirs, files in os.walk(dir):
for name in files:
if name in ('Efile'):
path = os.path.join(root, name)
with open(path,'r') as file:
cache[path] = file.read()
return cache
cache1 = scan_files(sys.argv[1])
cache2 = scan_files(sys.argv[2])
cache1_event_code_set = process_cache(cache1)
cache2_event_code_set = process_cache(cache2)
overlap_event_codes = cache1_event_code_set & cache2_event_code_set
print(overlap_event_codes)
For a typical three entries in a file, with comments(#),
0x00010d35 D 11 G 3,0x10009,N R XY.Condition, "(a 0x40001 == H 0x166) && (a 0x11ff8 == I 15)","0x0763ffc2 "
# Below event codes are from vendor xyz
0x84900c5 M 22 Y 1,0x03330469,4,5,6,7,8
0x04b60ff6 L 50 U
0x04c60e07,102 && (a 0x11ff8 == I 15)","0x0763ffc2 "
Picking 0x00010d35, 0x04b60ff6 & 0x84900c5 is the task. Rest of the line is supposed to be ignored
Some entries are multi-line with trailing backslash.
Each file is in megabytes. Total files in both folders - 80
1) Please suggest optimization in the below code, because pattern check is done twice.
if bool(re.search(r'^0[xX][0-9a-fA-F]+', line)):
# Take the hexcode
obj = re.search(r'^0[xX][0-9a-fA-F]+', line)
event_code_set.add(obj.string[obj.start():obj.end()])
2) Please suggest coding style optimizations for tree walk, cache and command line arguments.
python regex
edited Jul 21 at 13:46
Mast
7,30663483
7,30663483
asked Jul 18 at 15:03
overexchange
1,59021945
1,59021945
Returningcacheis a bug and wrong practice, despite it works...cache1should not point to localcache
â overexchange
Jul 21 at 12:48
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers.
â Mast
Jul 21 at 13:46
add a comment |Â
Returningcacheis a bug and wrong practice, despite it works...cache1should not point to localcache
â overexchange
Jul 21 at 12:48
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers.
â Mast
Jul 21 at 13:46
Returning
cache is a bug and wrong practice, despite it works... cache1 should not point to local cacheâ overexchange
Jul 21 at 12:48
Returning
cache is a bug and wrong practice, despite it works... cache1 should not point to local cacheâ overexchange
Jul 21 at 12:48
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers.
â Mast
Jul 21 at 13:46
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers.
â Mast
Jul 21 at 13:46
add a comment |Â
2 Answers
2
active
oldest
votes
up vote
4
down vote
accepted
Possible bugs
- In one place, you test for
if file_name.endswith('Efile'), and elsewhere, you test forif name in ('Efile'). What do you really intend to do? Are they supposed to be the same test? Can't you just test the filename once? Thename in ('Efile')test is particularly weird, since'fil'would pass the test. Your regex is case insensitive, which implies that you expect to be able to handle both uppercase and lowercase hex strings. But your set operations would treat uppercase and lowercase versions of the same hex code as distinct from each other, which is counterintuitive.
Your sample file shows that the hex codes may have leading zeroes (e.g.
0x00010d35), and that the codes may have varying lengths. So,0x00010d35would be treated differently from0x10d35, which is counterintuitive.The solution to both of these issues is to normalize the codes when constructing the set. One way would be to strip any leading zeroes and convert the string to lowercase. A better efficient solution would be to parse them as integers, since integer comparisons would be more efficient than string comparisons.
What's the point of
line = line.rstrip('\')? But what's the point of meddling with the ends of the lines, when you only care about the beginnings of the lines?On the other hand, your example file suggests that a backslash at the end of the line would be used to indicate that the following line is a continuation of the same record. In that case, what if the continuation line starts with something that looks like a hex code (with no leading whitespace)? Would you count that or not?
Efficiency
You said that each of the ~100 files may be several megabytes long. Your cache object, reads the entire contents of all of the files into memory! There is no good reason to store that much data, when all you care about is the hex codes at the beginning of each line.
If you want to split the work into two functions, I would have one of them be responsible for discovering the relevant file paths, and the other one responsible for extracting the codes.
If you run a regex search many times, it's worth compiling it first. I would design the regex so that it works on the entire file contents at once, rather than line by line.
Suggested solution
import os
import re
import sys
def efiles(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
yield os.path.join(root, name)
def event_codes(file_paths):
hex_re = re.compile(r'(?<!\n)^0[xX][0-9a-fA-F]+', re.MULTILINE)
for path in file_paths:
with open(path) as f:
for event_code in hex_re.findall(f.read()):
yield int(event_code, 16)
event_code_set1 = set(event_codes(efiles(sys.argv[1])))
event_code_set2 = set(event_codes(efiles(sys.argv[2])))
overlap_event_codes = set(hex(n) for n in (event_code_set1 & event_code_set2))
print(overlap_event_codes)
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
I actually do not needendwithbecause the file name must beEfile.. Soname == 'Efile'should work
â overexchange
Jul 19 at 14:16
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
 |Â
show 2 more comments
up vote
5
down vote
Repeating the match can be avoided by storing it in a variable first:
match = re.search(r'^0[xX][0-9a-fA-F]+', line)
if match:
event_codes.add(match.group())
Note also that you don't need to strip a trailing , because it is ignored afterwards anyways and you are only interested in hex codes at the beginnings of the line and all your multilines seem to have some whitespace at the beginning of the continued line. Note that you could have used re.match(r'0[xX][0-9a-fA-F]+', line), instead of re.search, because it always matches only the beginning of the line.
Python also caches your regex, once you have used it, but you may still squeeze out a bit of performance with re.compile.
I also used the match.group() method, which immediately returns the matching part if the string.
I would also not suggest caching the file content of all the files in the directories. This seems like a huge waste of memory. Instead process them one at a time and add it to your event code sets, or even better, just generate a stream of event codes that you can consume with set:
import os
import sys
import re
def event_codes_file(path):
with open(path) as file:
for line in file:
match = re.match(r'0[xX][0-9a-fA-F]+', line)
if match:
yield match.group()
def event_codes_dir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
path = os.path.join(root, name)
yield from event_codes_file(path)
if __name__ == "__main__":
event_codes1 = set(event_codes_dir(sys.argv[1]))
event_codes2 = set(event_codes_dir(sys.argv[2]))
overlap_event_codes = event_codes1 & event_codes2
print(overlap_event_codes)
yield from x was introduced in Python 3.3 and is mostly equivalent to for i in x: yield i.
Also note that name in ('Efile') is almost the same as name == 'Efile', but matches a bit more (like 'E'). Unless of course you meant what you wrote in process_cache, name.endswith('Efile'). I am assuming the latter.
On a file "test", containing your example content, this produces:
set(event_codes_file("test"))
# '0x00010d35', '0x04b60ff6', '0x84900c5'
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separatesetper file.
â Graipher
Jul 18 at 17:31
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Doeslinenot have trailing backslash? Because you are passing tore.search().re.search(r'\$', 'hellothere')does not work
â overexchange
Jul 18 at 18:51
2
name in ('Efile')is not the same asname == 'Efile'. For example,'fi' in ('Efile')isTrue.
â 200_success
Jul 18 at 21:28
 |Â
show 9 more comments
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
4
down vote
accepted
Possible bugs
- In one place, you test for
if file_name.endswith('Efile'), and elsewhere, you test forif name in ('Efile'). What do you really intend to do? Are they supposed to be the same test? Can't you just test the filename once? Thename in ('Efile')test is particularly weird, since'fil'would pass the test. Your regex is case insensitive, which implies that you expect to be able to handle both uppercase and lowercase hex strings. But your set operations would treat uppercase and lowercase versions of the same hex code as distinct from each other, which is counterintuitive.
Your sample file shows that the hex codes may have leading zeroes (e.g.
0x00010d35), and that the codes may have varying lengths. So,0x00010d35would be treated differently from0x10d35, which is counterintuitive.The solution to both of these issues is to normalize the codes when constructing the set. One way would be to strip any leading zeroes and convert the string to lowercase. A better efficient solution would be to parse them as integers, since integer comparisons would be more efficient than string comparisons.
What's the point of
line = line.rstrip('\')? But what's the point of meddling with the ends of the lines, when you only care about the beginnings of the lines?On the other hand, your example file suggests that a backslash at the end of the line would be used to indicate that the following line is a continuation of the same record. In that case, what if the continuation line starts with something that looks like a hex code (with no leading whitespace)? Would you count that or not?
Efficiency
You said that each of the ~100 files may be several megabytes long. Your cache object, reads the entire contents of all of the files into memory! There is no good reason to store that much data, when all you care about is the hex codes at the beginning of each line.
If you want to split the work into two functions, I would have one of them be responsible for discovering the relevant file paths, and the other one responsible for extracting the codes.
If you run a regex search many times, it's worth compiling it first. I would design the regex so that it works on the entire file contents at once, rather than line by line.
Suggested solution
import os
import re
import sys
def efiles(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
yield os.path.join(root, name)
def event_codes(file_paths):
hex_re = re.compile(r'(?<!\n)^0[xX][0-9a-fA-F]+', re.MULTILINE)
for path in file_paths:
with open(path) as f:
for event_code in hex_re.findall(f.read()):
yield int(event_code, 16)
event_code_set1 = set(event_codes(efiles(sys.argv[1])))
event_code_set2 = set(event_codes(efiles(sys.argv[2])))
overlap_event_codes = set(hex(n) for n in (event_code_set1 & event_code_set2))
print(overlap_event_codes)
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
I actually do not needendwithbecause the file name must beEfile.. Soname == 'Efile'should work
â overexchange
Jul 19 at 14:16
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
 |Â
show 2 more comments
up vote
4
down vote
accepted
Possible bugs
- In one place, you test for
if file_name.endswith('Efile'), and elsewhere, you test forif name in ('Efile'). What do you really intend to do? Are they supposed to be the same test? Can't you just test the filename once? Thename in ('Efile')test is particularly weird, since'fil'would pass the test. Your regex is case insensitive, which implies that you expect to be able to handle both uppercase and lowercase hex strings. But your set operations would treat uppercase and lowercase versions of the same hex code as distinct from each other, which is counterintuitive.
Your sample file shows that the hex codes may have leading zeroes (e.g.
0x00010d35), and that the codes may have varying lengths. So,0x00010d35would be treated differently from0x10d35, which is counterintuitive.The solution to both of these issues is to normalize the codes when constructing the set. One way would be to strip any leading zeroes and convert the string to lowercase. A better efficient solution would be to parse them as integers, since integer comparisons would be more efficient than string comparisons.
What's the point of
line = line.rstrip('\')? But what's the point of meddling with the ends of the lines, when you only care about the beginnings of the lines?On the other hand, your example file suggests that a backslash at the end of the line would be used to indicate that the following line is a continuation of the same record. In that case, what if the continuation line starts with something that looks like a hex code (with no leading whitespace)? Would you count that or not?
Efficiency
You said that each of the ~100 files may be several megabytes long. Your cache object, reads the entire contents of all of the files into memory! There is no good reason to store that much data, when all you care about is the hex codes at the beginning of each line.
If you want to split the work into two functions, I would have one of them be responsible for discovering the relevant file paths, and the other one responsible for extracting the codes.
If you run a regex search many times, it's worth compiling it first. I would design the regex so that it works on the entire file contents at once, rather than line by line.
Suggested solution
import os
import re
import sys
def efiles(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
yield os.path.join(root, name)
def event_codes(file_paths):
hex_re = re.compile(r'(?<!\n)^0[xX][0-9a-fA-F]+', re.MULTILINE)
for path in file_paths:
with open(path) as f:
for event_code in hex_re.findall(f.read()):
yield int(event_code, 16)
event_code_set1 = set(event_codes(efiles(sys.argv[1])))
event_code_set2 = set(event_codes(efiles(sys.argv[2])))
overlap_event_codes = set(hex(n) for n in (event_code_set1 & event_code_set2))
print(overlap_event_codes)
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
I actually do not needendwithbecause the file name must beEfile.. Soname == 'Efile'should work
â overexchange
Jul 19 at 14:16
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
 |Â
show 2 more comments
up vote
4
down vote
accepted
up vote
4
down vote
accepted
Possible bugs
- In one place, you test for
if file_name.endswith('Efile'), and elsewhere, you test forif name in ('Efile'). What do you really intend to do? Are they supposed to be the same test? Can't you just test the filename once? Thename in ('Efile')test is particularly weird, since'fil'would pass the test. Your regex is case insensitive, which implies that you expect to be able to handle both uppercase and lowercase hex strings. But your set operations would treat uppercase and lowercase versions of the same hex code as distinct from each other, which is counterintuitive.
Your sample file shows that the hex codes may have leading zeroes (e.g.
0x00010d35), and that the codes may have varying lengths. So,0x00010d35would be treated differently from0x10d35, which is counterintuitive.The solution to both of these issues is to normalize the codes when constructing the set. One way would be to strip any leading zeroes and convert the string to lowercase. A better efficient solution would be to parse them as integers, since integer comparisons would be more efficient than string comparisons.
What's the point of
line = line.rstrip('\')? But what's the point of meddling with the ends of the lines, when you only care about the beginnings of the lines?On the other hand, your example file suggests that a backslash at the end of the line would be used to indicate that the following line is a continuation of the same record. In that case, what if the continuation line starts with something that looks like a hex code (with no leading whitespace)? Would you count that or not?
Efficiency
You said that each of the ~100 files may be several megabytes long. Your cache object, reads the entire contents of all of the files into memory! There is no good reason to store that much data, when all you care about is the hex codes at the beginning of each line.
If you want to split the work into two functions, I would have one of them be responsible for discovering the relevant file paths, and the other one responsible for extracting the codes.
If you run a regex search many times, it's worth compiling it first. I would design the regex so that it works on the entire file contents at once, rather than line by line.
Suggested solution
import os
import re
import sys
def efiles(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
yield os.path.join(root, name)
def event_codes(file_paths):
hex_re = re.compile(r'(?<!\n)^0[xX][0-9a-fA-F]+', re.MULTILINE)
for path in file_paths:
with open(path) as f:
for event_code in hex_re.findall(f.read()):
yield int(event_code, 16)
event_code_set1 = set(event_codes(efiles(sys.argv[1])))
event_code_set2 = set(event_codes(efiles(sys.argv[2])))
overlap_event_codes = set(hex(n) for n in (event_code_set1 & event_code_set2))
print(overlap_event_codes)
Possible bugs
- In one place, you test for
if file_name.endswith('Efile'), and elsewhere, you test forif name in ('Efile'). What do you really intend to do? Are they supposed to be the same test? Can't you just test the filename once? Thename in ('Efile')test is particularly weird, since'fil'would pass the test. Your regex is case insensitive, which implies that you expect to be able to handle both uppercase and lowercase hex strings. But your set operations would treat uppercase and lowercase versions of the same hex code as distinct from each other, which is counterintuitive.
Your sample file shows that the hex codes may have leading zeroes (e.g.
0x00010d35), and that the codes may have varying lengths. So,0x00010d35would be treated differently from0x10d35, which is counterintuitive.The solution to both of these issues is to normalize the codes when constructing the set. One way would be to strip any leading zeroes and convert the string to lowercase. A better efficient solution would be to parse them as integers, since integer comparisons would be more efficient than string comparisons.
What's the point of
line = line.rstrip('\')? But what's the point of meddling with the ends of the lines, when you only care about the beginnings of the lines?On the other hand, your example file suggests that a backslash at the end of the line would be used to indicate that the following line is a continuation of the same record. In that case, what if the continuation line starts with something that looks like a hex code (with no leading whitespace)? Would you count that or not?
Efficiency
You said that each of the ~100 files may be several megabytes long. Your cache object, reads the entire contents of all of the files into memory! There is no good reason to store that much data, when all you care about is the hex codes at the beginning of each line.
If you want to split the work into two functions, I would have one of them be responsible for discovering the relevant file paths, and the other one responsible for extracting the codes.
If you run a regex search many times, it's worth compiling it first. I would design the regex so that it works on the entire file contents at once, rather than line by line.
Suggested solution
import os
import re
import sys
def efiles(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
yield os.path.join(root, name)
def event_codes(file_paths):
hex_re = re.compile(r'(?<!\n)^0[xX][0-9a-fA-F]+', re.MULTILINE)
for path in file_paths:
with open(path) as f:
for event_code in hex_re.findall(f.read()):
yield int(event_code, 16)
event_code_set1 = set(event_codes(efiles(sys.argv[1])))
event_code_set2 = set(event_codes(efiles(sys.argv[2])))
overlap_event_codes = set(hex(n) for n in (event_code_set1 & event_code_set2))
print(overlap_event_codes)
answered Jul 18 at 22:14
200_success
123k14143399
123k14143399
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
I actually do not needendwithbecause the file name must beEfile.. Soname == 'Efile'should work
â overexchange
Jul 19 at 14:16
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
 |Â
show 2 more comments
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
I actually do not needendwithbecause the file name must beEfile.. Soname == 'Efile'should work
â overexchange
Jul 19 at 14:16
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
for rstrip? stackoverflow.com/q/51409385/3317808
â overexchange
Jul 19 at 14:10
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
continuation line starts with something that looks like a hex code (with no leading whitespace)? Am not suppose to count... you can say my code is relying on indentation with white space... which is vulnerable... How do I deal with this? Do i need to remember previous line read?
â overexchange
Jul 19 at 14:11
I actually do not need
endwith because the file name must be Efile.. So name == 'Efile' should workâ overexchange
Jul 19 at 14:16
I actually do not need
endwith because the file name must be Efile.. So name == 'Efile' should workâ overexchange
Jul 19 at 14:16
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
Yes, I am saying that you are relying on the whitespace. My suggested solution solves it by using a negative look-behind assertion in the regex, and applying the regex to the entire file at once rather than line by line.
â 200_success
Jul 19 at 14:22
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
As file is MB size, can I avoid opening of complete file?
â overexchange
Jul 19 at 23:25
 |Â
show 2 more comments
up vote
5
down vote
Repeating the match can be avoided by storing it in a variable first:
match = re.search(r'^0[xX][0-9a-fA-F]+', line)
if match:
event_codes.add(match.group())
Note also that you don't need to strip a trailing , because it is ignored afterwards anyways and you are only interested in hex codes at the beginnings of the line and all your multilines seem to have some whitespace at the beginning of the continued line. Note that you could have used re.match(r'0[xX][0-9a-fA-F]+', line), instead of re.search, because it always matches only the beginning of the line.
Python also caches your regex, once you have used it, but you may still squeeze out a bit of performance with re.compile.
I also used the match.group() method, which immediately returns the matching part if the string.
I would also not suggest caching the file content of all the files in the directories. This seems like a huge waste of memory. Instead process them one at a time and add it to your event code sets, or even better, just generate a stream of event codes that you can consume with set:
import os
import sys
import re
def event_codes_file(path):
with open(path) as file:
for line in file:
match = re.match(r'0[xX][0-9a-fA-F]+', line)
if match:
yield match.group()
def event_codes_dir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
path = os.path.join(root, name)
yield from event_codes_file(path)
if __name__ == "__main__":
event_codes1 = set(event_codes_dir(sys.argv[1]))
event_codes2 = set(event_codes_dir(sys.argv[2]))
overlap_event_codes = event_codes1 & event_codes2
print(overlap_event_codes)
yield from x was introduced in Python 3.3 and is mostly equivalent to for i in x: yield i.
Also note that name in ('Efile') is almost the same as name == 'Efile', but matches a bit more (like 'E'). Unless of course you meant what you wrote in process_cache, name.endswith('Efile'). I am assuming the latter.
On a file "test", containing your example content, this produces:
set(event_codes_file("test"))
# '0x00010d35', '0x04b60ff6', '0x84900c5'
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separatesetper file.
â Graipher
Jul 18 at 17:31
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Doeslinenot have trailing backslash? Because you are passing tore.search().re.search(r'\$', 'hellothere')does not work
â overexchange
Jul 18 at 18:51
2
name in ('Efile')is not the same asname == 'Efile'. For example,'fi' in ('Efile')isTrue.
â 200_success
Jul 18 at 21:28
 |Â
show 9 more comments
up vote
5
down vote
Repeating the match can be avoided by storing it in a variable first:
match = re.search(r'^0[xX][0-9a-fA-F]+', line)
if match:
event_codes.add(match.group())
Note also that you don't need to strip a trailing , because it is ignored afterwards anyways and you are only interested in hex codes at the beginnings of the line and all your multilines seem to have some whitespace at the beginning of the continued line. Note that you could have used re.match(r'0[xX][0-9a-fA-F]+', line), instead of re.search, because it always matches only the beginning of the line.
Python also caches your regex, once you have used it, but you may still squeeze out a bit of performance with re.compile.
I also used the match.group() method, which immediately returns the matching part if the string.
I would also not suggest caching the file content of all the files in the directories. This seems like a huge waste of memory. Instead process them one at a time and add it to your event code sets, or even better, just generate a stream of event codes that you can consume with set:
import os
import sys
import re
def event_codes_file(path):
with open(path) as file:
for line in file:
match = re.match(r'0[xX][0-9a-fA-F]+', line)
if match:
yield match.group()
def event_codes_dir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
path = os.path.join(root, name)
yield from event_codes_file(path)
if __name__ == "__main__":
event_codes1 = set(event_codes_dir(sys.argv[1]))
event_codes2 = set(event_codes_dir(sys.argv[2]))
overlap_event_codes = event_codes1 & event_codes2
print(overlap_event_codes)
yield from x was introduced in Python 3.3 and is mostly equivalent to for i in x: yield i.
Also note that name in ('Efile') is almost the same as name == 'Efile', but matches a bit more (like 'E'). Unless of course you meant what you wrote in process_cache, name.endswith('Efile'). I am assuming the latter.
On a file "test", containing your example content, this produces:
set(event_codes_file("test"))
# '0x00010d35', '0x04b60ff6', '0x84900c5'
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separatesetper file.
â Graipher
Jul 18 at 17:31
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Doeslinenot have trailing backslash? Because you are passing tore.search().re.search(r'\$', 'hellothere')does not work
â overexchange
Jul 18 at 18:51
2
name in ('Efile')is not the same asname == 'Efile'. For example,'fi' in ('Efile')isTrue.
â 200_success
Jul 18 at 21:28
 |Â
show 9 more comments
up vote
5
down vote
up vote
5
down vote
Repeating the match can be avoided by storing it in a variable first:
match = re.search(r'^0[xX][0-9a-fA-F]+', line)
if match:
event_codes.add(match.group())
Note also that you don't need to strip a trailing , because it is ignored afterwards anyways and you are only interested in hex codes at the beginnings of the line and all your multilines seem to have some whitespace at the beginning of the continued line. Note that you could have used re.match(r'0[xX][0-9a-fA-F]+', line), instead of re.search, because it always matches only the beginning of the line.
Python also caches your regex, once you have used it, but you may still squeeze out a bit of performance with re.compile.
I also used the match.group() method, which immediately returns the matching part if the string.
I would also not suggest caching the file content of all the files in the directories. This seems like a huge waste of memory. Instead process them one at a time and add it to your event code sets, or even better, just generate a stream of event codes that you can consume with set:
import os
import sys
import re
def event_codes_file(path):
with open(path) as file:
for line in file:
match = re.match(r'0[xX][0-9a-fA-F]+', line)
if match:
yield match.group()
def event_codes_dir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
path = os.path.join(root, name)
yield from event_codes_file(path)
if __name__ == "__main__":
event_codes1 = set(event_codes_dir(sys.argv[1]))
event_codes2 = set(event_codes_dir(sys.argv[2]))
overlap_event_codes = event_codes1 & event_codes2
print(overlap_event_codes)
yield from x was introduced in Python 3.3 and is mostly equivalent to for i in x: yield i.
Also note that name in ('Efile') is almost the same as name == 'Efile', but matches a bit more (like 'E'). Unless of course you meant what you wrote in process_cache, name.endswith('Efile'). I am assuming the latter.
On a file "test", containing your example content, this produces:
set(event_codes_file("test"))
# '0x00010d35', '0x04b60ff6', '0x84900c5'
Repeating the match can be avoided by storing it in a variable first:
match = re.search(r'^0[xX][0-9a-fA-F]+', line)
if match:
event_codes.add(match.group())
Note also that you don't need to strip a trailing , because it is ignored afterwards anyways and you are only interested in hex codes at the beginnings of the line and all your multilines seem to have some whitespace at the beginning of the continued line. Note that you could have used re.match(r'0[xX][0-9a-fA-F]+', line), instead of re.search, because it always matches only the beginning of the line.
Python also caches your regex, once you have used it, but you may still squeeze out a bit of performance with re.compile.
I also used the match.group() method, which immediately returns the matching part if the string.
I would also not suggest caching the file content of all the files in the directories. This seems like a huge waste of memory. Instead process them one at a time and add it to your event code sets, or even better, just generate a stream of event codes that you can consume with set:
import os
import sys
import re
def event_codes_file(path):
with open(path) as file:
for line in file:
match = re.match(r'0[xX][0-9a-fA-F]+', line)
if match:
yield match.group()
def event_codes_dir(dir):
for root, dirs, files in os.walk(dir):
for name in files:
if name.endswith('Efile'):
path = os.path.join(root, name)
yield from event_codes_file(path)
if __name__ == "__main__":
event_codes1 = set(event_codes_dir(sys.argv[1]))
event_codes2 = set(event_codes_dir(sys.argv[2]))
overlap_event_codes = event_codes1 & event_codes2
print(overlap_event_codes)
yield from x was introduced in Python 3.3 and is mostly equivalent to for i in x: yield i.
Also note that name in ('Efile') is almost the same as name == 'Efile', but matches a bit more (like 'E'). Unless of course you meant what you wrote in process_cache, name.endswith('Efile'). I am assuming the latter.
On a file "test", containing your example content, this produces:
set(event_codes_file("test"))
# '0x00010d35', '0x04b60ff6', '0x84900c5'
edited Jul 19 at 7:24
answered Jul 18 at 16:34
Graipher
20.4k42981
20.4k42981
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separatesetper file.
â Graipher
Jul 18 at 17:31
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Doeslinenot have trailing backslash? Because you are passing tore.search().re.search(r'\$', 'hellothere')does not work
â overexchange
Jul 18 at 18:51
2
name in ('Efile')is not the same asname == 'Efile'. For example,'fi' in ('Efile')isTrue.
â 200_success
Jul 18 at 21:28
 |Â
show 9 more comments
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separatesetper file.
â Graipher
Jul 18 at 17:31
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Doeslinenot have trailing backslash? Because you are passing tore.search().re.search(r'\$', 'hellothere')does not work
â overexchange
Jul 18 at 18:51
2
name in ('Efile')is not the same asname == 'Efile'. For example,'fi' in ('Efile')isTrue.
â 200_success
Jul 18 at 21:28
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
Is Generator enabling the code to avoid cache?
â overexchange
Jul 18 at 17:17
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separate
set per file.â Graipher
Jul 18 at 17:31
@overexchange Basically yes, but it could have been done differently. The important part is processing each file as you discover it, avoiding having to load all of them into memory at once. The generator really only avoids having to carry around a separate
set per file.â Graipher
Jul 18 at 17:31
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Yes... Pipe line processing...
â overexchange
Jul 18 at 17:44
Does
line not have trailing backslash? Because you are passing to re.search(). re.search(r'\$', 'hellothere') does not workâ overexchange
Jul 18 at 18:51
Does
line not have trailing backslash? Because you are passing to re.search(). re.search(r'\$', 'hellothere') does not workâ overexchange
Jul 18 at 18:51
2
2
name in ('Efile') is not the same as name == 'Efile'. For example, 'fi' in ('Efile') is True.â 200_success
Jul 18 at 21:28
name in ('Efile') is not the same as name == 'Efile'. For example, 'fi' in ('Efile') is True.â 200_success
Jul 18 at 21:28
 |Â
show 9 more comments
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f199759%2fscan-files-and-pick-overlapped-hex-codes%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Returning
cacheis a bug and wrong practice, despite it works...cache1should not point to localcacheâ overexchange
Jul 21 at 12:48
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers.
â Mast
Jul 21 at 13:46