Notes On Regex Applications
# Recognize Me, Regex’ize Me
We use Python for the illustrations here.
import re
def format_detected(target_strings, patterns_regex, patterns_regex_types):
    """Print every (string, pattern type) pair whose pattern matches the string.

    Each entry of ``target_strings`` is tried against each pattern with
    ``re.match``, i.e. matching always starts at the beginning of the string.

    Args:
        target_strings: strings to classify.
        patterns_regex: regular-expression sources, index-aligned with
            ``patterns_regex_types``.
        patterns_regex_types: human-readable label for each pattern.

    Returns:
        None. Results are reported on standard output only.
    """
    # Compile each pattern once instead of once per (string, pattern) pair.
    labelled_patterns = [
        (re.compile(pattern), label)
        for pattern, label in zip(patterns_regex, patterns_regex_types)
    ]
    for target in target_strings:
        for compiled, label in labelled_patterns:
            # Match against the argument that was passed in, not against a
            # module-level list that merely happens to hold the same data.
            if compiled.match(target) is not None:
                print(target + ": Search successful for " + label)
    return None
def displaymatch(match):
    """Describe a match object as text.

    Returns None when ``match`` is None (the pattern did not match);
    otherwise returns a string showing the matched text and the tuple of
    captured groups.
    """
    if match is not None:
        matched_text = match.group()
        captured_groups = match.groups()
        return '<Match: %r, groups=%r>' % (matched_text, captured_groups)
    return None
# ===== [main] =====
# Label for each pattern; index-aligned with `patterns` and `test_strings`.
target_to_scan_for = [
    'Email address',
    'Private key',
    'Seed phrase',
    'Wallet address',
    'Test case: 0',
    'Phone number',
]
# The TLD class is [A-Za-z]: inside [...] a "|" is a literal pipe, not alternation.
pattern_email = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
patterns = [
    pattern_email,
    r"^[A-F0-9]{32}$",
    r"^(\w+\s){11}\w+$",
    r"^LB[a-fA-F0-9]{24}$",
    r"^[a2-9tjqk]{5}$",
    r"^(\+\d{1,2}\s?)?1?\-?\.?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$",
]
# One sample input per pattern, in the same order as `patterns`.
test_strings = [
    "axx@gmail.com",
    "A6FDF18E86000542388064492B58CBF1",
    "this is a long string of text consisting of twelve random words",
    "LB32b787573F5186C696b8ed61",
    "akt5q",
    "+1-000-456-7890",
]
# Index of the pattern / test-string pair exercised by the unit test.
choice = 0
# Unit test: try the single pattern selected by `choice` against its sample.
valid = re.compile(patterns[choice])
result = displaymatch(valid.match(test_strings[choice]))
print(result)
if result:
    print(target_to_scan_for[choice] + ": Search successful.")
else:
    print(target_to_scan_for[choice] + ": Search unsuccessful.")
# Full-run test: try every sample against every pattern.
patterns_regex = patterns
patterns_regex_types = target_to_scan_for
format_detected(test_strings, patterns_regex, patterns_regex_types)
# Find every e-mail address embedded in a longer text. re.finditer is called
# directly so that this part runs top-to-bottom without relying on helper
# functions that are only defined further down.
test_string = "The meeting is at 3pm today or 5 A.M. tomorrow or 7 this afternoon or 00:00 midnight --- axx@gmail.com ---. Let's meet at 11.30 p.M. We can also do 8:45 pm or 1200 hr or 00hr."
matches_group_list = [m.group() for m in re.finditer(pattern_email, test_string, re.I)]
print(matches_group_list)
matches_locations_list = [(m.start(), m.end()) for m in re.finditer(pattern_email, test_string, re.I)]
print(matches_locations_list)
Sample result:
"""
axx@gmail.com: Search successful for Email address
A6FDF18E86000542388064492B58CBF1: Search successful for Private key
this is a long string of text consisting of twelve random words: Search successful for Seed phrase
LB32b787573F5186C696b8ed61: Search successful for Wallet address
akt5q: Search successful for Test case: 0
+1-000-456-7890: Search successful for Phone number
"""
You can also use a short test to check one specific case:
pattern_X = r"..."
test_string = "...."
# Unit test: compile the pattern, match from the start of the string and
# report the outcome.
valid = re.compile(pattern_X)
result = displaymatch(valid.match(test_string))
print(result)
outcome = ": Search successful." if result else ": Search unsuccessful."
print(test_string + outcome)
Email address
Private key formatted
Seed phrase
Wallet address
Phone number
IP address
Directory path
URL
Test passcode strength requirements
HTML tags
Javascript handlers
Search duplicates in text
Address (Geolocation)
Hexadecimal strings
Linux permissions
File types
Timestamp
Time (without dates)
Specific format
N-gram
Template Detection
Email headers
In Between Lines
Email address
# The TLD class is [A-Za-z]: inside [...] a "|" is a literal pipe, not alternation.
pattern_email = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
test_string = "axx@gmail.com"
Private key formatted
This example identifies a string of characters that forms an example private key: exactly 128 bits (32 hexadecimal characters), written as an unspaced, upper-case hexadecimal string on a single line.
# The digit range must use an ASCII hyphen ("0-9"). With an en dash the class
# holds the three literal characters "0", "–" and "9" instead of a range.
pattern_private_key_formatted = r"^[A-F0-9]{32}$"
test_string = "A6FDF18E86000542388064492B58CBF1"
Seed phrase
This example identifies a string of characters that forms a seed phrase: 12 words, each separated by a single space, on a single line.
# Eleven "<word><whitespace>" groups followed by a final word: 12 words in total.
# NOTE(review): \s matches any single whitespace character (tab, newline, ...),
# not only a space — confirm whether a literal " " was intended.
pattern_seed_phrase = r"^(\w+\s){11}\w+$"
test_string = "this is a long string of text consisting of twelve random words"
Wallet address
This example identifies a string of characters that forms an example public wallet address: exactly 24 hexadecimal characters, unspaced, preceded by the literal letters “LB”.
# Literal "LB" prefix followed by exactly 24 hexadecimal characters (either case).
pattern_wallet_address = r"^LB[a-fA-F0-9]{24}$"
test_string = "LB32b787573F5186C696b8ed61"
# For comparison, a different address format (leading "1" or "3", then 25-34
# characters from an alphabet without "0", "O", "I" and "l"):
#   r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$"  e.g. "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"
Phone number
# Optional "+<1-2 digits>" prefix, optional leading "1", then a 3-3-4 digit
# number whose groups may be separated by whitespace, "." or "-"; the first
# group of three digits may be wrapped in parentheses.
pattern_phone_number = r"^(\+\d{1,2}\s?)?1?\-?\.?\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$"
test_string = "+1-000-456-7890"
IP address
# Four dot-separated octets, each in 0-255. Every range uses an ASCII hyphen:
# an en dash inside [...] is a literal character, not a range operator.
pattern_ip_address = r"\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
# IPv4 only: ^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$
# IPv6 only: (([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))
""" e.g.
1200:0000:AB00:1234:0000:2552:7777:1313
1200::AB00:1234::2552:7777:1313
21DA:D3:0:2F3B:2AA:FF:FE28:9C5A
1200:0000:AB00:1234:O000:2552:7777:1313 // invalid characters present
FE80:0000:0000:0000:0202:B3FF:FE1E:8329
[2001:db8:0:1]:80 // valid, no support for port numbers
http://[2001:db8:0:1]:80 // valid, no support for IP address in a URL
"""
# For both IPv4 and IPv6: ((^\s*((([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))\s*$)|(^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$))
""" e.g.
1200:0000:AB00:1234:0000:2552:7777:1313
21DA:D3:0:2F3B:2AA:FF:FE28:9C5A
1200:0000:AB00:1234:O000:2552:7777:1313 // invalid characters present
FE80:0000:0000:0000:0202:B3FF:FE1E:8329
[2001:db8:0:1]:80 // valid, no support for port numbers
http://[2001:db8:0:1]:80 // valid, no support for IP address in a URL
0.0.0.0
9.255.255.255
"""
test_string = "255.255.11.135" # -ve case: "999.255.11.135"
Directory path
pattern_directory_path= r"([^/\\]*)"
"""
File Path with Filename and extension:
((\/|\\|\/\/|https?:\\\\|https?:\/\/)[a-z0-9 _@\-^!#$%&+={}.\/\\\[\]]+)+\.[a-z]+$
File Path with optional Filename, extension:
^(.+)/([^/]+)$
File Name with extension having 3 chars:
^[\w,\s-]+\.[A-Za-z]{3}$
"""
# Raw string: "\M", "\s" and "\p" are not valid escape sequences in a normal literal.
test_string = r"this file is from \My Documents\sensitive\passcodes" # or "this file is from /My Documents/sensitive/passcodes"
URL
# Protocol required: http(s). Character ranges use ASCII hyphens ("0-9").
pattern_url_with_protocol = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)"
pattern_X = pattern_url_with_protocol
# Protocol optional
pattern_url_optional_protocol = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
pattern_X = pattern_url_optional_protocol
# general: url_pattern = "(https?)://(www)?.?(\\w+).(\\w+)/?(\\w+)?"
test_string = "http://foo.com/blah_(wikipedia)#cite-1"
Test passcode strength requirements
Test for specific passcode strength requirements, e.g. the passcode must contain at least 1 lowercase letter, 1 uppercase letter, 1 digit and 1 special character, and be at least 8 characters long.
# Five look-aheads, all evaluated at the same position, followed by ".{8,}":
#   (?=(.*[0-9]))   at least one digit
#   (?=.*[...])     at least one of the listed special characters
#   (?=.*[a-z])     at least one lower-case letter
#   (?=(.*[A-Z]))   at least one upper-case letter
#   (?=(.*))        always succeeds
#   .{8,}           at least 8 characters
pattern_X = r"(?=(.*[0-9]))(?=.*[\!@#$%^&*()\\[\]{}\-_+=~`|:;\"'<>,./?])(?=.*[a-z])(?=(.*[A-Z]))(?=(.*)).{8,}"
test_string = "sEcReT_*(8???)"
HTML tags
# Either a simple tag such as <p> or </p>, or a "<...>" span whose last
# character before ">" is a non-word character.
pattern_X = r"<\/?[\w\s]*>|<.+[\W]>"
test_string = "<p>\"Hello ... hello.\"</p>"
Javascript handlers
# Matches an attribute of the form on<name>=<value>, where "on" starts at a
# word boundary and <value> is a run of non-whitespace characters, provided a
# ">" follows later (on the same line unless re.DOTALL is used).
pattern_X = r"\bon\w+=\S+(?=.*>)"
test_string = """<img src="foo.jpg" onload=function_xyz />
<img onmessage="javascript:execute()">
<a notonmessage="nomatch-here" onfocus="alert('hey')" onclick=foo() disabled>
"""
Search duplicates in text
# Group 1 captures a whole word; the look-ahead then requires the same word
# (back-reference \1) to appear again further along.
pattern_X = r"(\b\w+\b)(?=.*\b\1\b)"
# needs to be translated from """ ... """ to "..." before use
test_string = """
world .... world .... world world ...
"""
Address (Geolocation)
pattern_geolocation_address = "(\\d*)\\s?(.+),\\s(.+)\\s([A-Z]{2,3})\\s(\\d{4})"
Notes:
"""
(\\d*) := 0 or more digit characters because some addresses do not have house numbers (1st capture group i.e. house number)
\\s? := 0 or 1 whitespace character
(.+) := 1 or more characters (2nd capture group i.e. street name)
, := comma
\\s := a single whitespace character
(.+) := 1 or more characters (3rd capture group i.e. suburb)
\\s := a single whitespace character
([A-Z]{2,3}) := 2 or 3 uppercase letters (4th capture group i.e. state)
\\s := a single whitespace character
(\\d{4}) := 4 digit characters (5th capture group i.e. postcode)
"""
# Raw string so that "\d" and "\s" reach the regex engine unchanged instead of
# being read as (invalid) string escape sequences.
pattern_geolocation_coordinates = r"(\d{1,4})([.])(\d{4})[°](\s?)([NS]?)[NSEW](\s?)((\s+?)?)[,](\s?)((\s+?)?)(\d{1,4})([.])(\d{4})[°](\s?)([NS]?)[NSEW]"
test_string = "38.8719° N , 77.0563° SW" # 90.0000° S, 45.0000° E
Hexadecimal strings
# Optional "0x"/"0X" prefix followed by one or more hexadecimal digits,
# anchored at both ends of the string.
pattern_X = r"^(0x|0X)?[a-fA-F0-9]+$"
test_string = "0xaa005a" # aa005a
Linux permissions
# Numeric notation: one to four octal digits.
pattern_X = r"[0-7]{1,4}" # linux-file-permissions-numeric-notation
test_string = "666" # "0655"
# Symbolic notation, kept inside a string so it does not overwrite pattern_X.
# Inside [...] no "|" separators are needed, and "-" is listed first so that it
# is a literal hyphen rather than a range operator.
"""
pattern_X = r"[-dbclpsD][-rwx]{9}"
test_string = "drwxr-xr-x"
"""
File types
# Any string ending in "." plus one of the listed extensions.
pattern_X = r"(?i)^.*\.(jpg|jpeg|gif|doc|pdf|txt)$" # (?i): case insensitive
test_string = "hello.JpEg" #
Timestamp
# Numeric dates: <2 or 4 digits><sep><two digits starting with 0, 1 or 2><sep>
# <2 to 4 digits>, where <sep> is "-", " ", "/" or ".".
pattern_date = (
    r"(\d{2}|\d{4})[- \/.]((0|1|2)\d{1})[- \/.](\d{2,4})"
)
test_strings = ["22/02/2222_0000hr", "22.02.2222", "22-02.2222", "2222/02/22", "2222-02/22", "2222-02.22"]
# Dates with an English month name (full or abbreviated, first letter in
# either case). In the last group \d{4} is tried before \d{1,2}; otherwise a
# four-digit year would be cut short after its first two digits.
pattern_date = (
    r"(\d{1,2}|\d{4})[- /.]((J|j)an(uary)?|(F|f)eb(ruary)?|(M|m)ar(ch)?|(A|a)pr(il)?|(M|m)ay|(J|j)un(e)?|(J|j)ul(y)?|(A|a)ug(ust)?|(S|s)ep(t(ember)?)?|(O|o)ct(ober)?|(N|n)ov(ember)?|(D|d)ec(ember)?)[- /.](\d{4}|\d{1,2})"
)
test_strings = ["02-May-2011", "02 May 2011", "11.may 2011", "11.june 2011", "11.Jul 2011"]
Ref: https://www.regextester.com
Time (without dates)
import re
def get_matches_group_list(pattern, string_target):
    """Return the text of every non-overlapping, case-insensitive match.

    Args:
        pattern: regular-expression source (or compiled pattern) to search for.
        string_target: text to scan.

    Returns:
        A list with the matched substring of each hit, in order of appearance.
    """
    found = []
    for hit in re.finditer(pattern, string_target, flags=re.IGNORECASE):
        found.append(hit.group())
    return found
def get_matches_locations(pattern, string_target):
    """Return the (start, end) offsets of every case-insensitive match.

    Args:
        pattern: regular-expression source (or compiled pattern) to search for.
        string_target: text to scan.

    Returns:
        A list of (start, end) tuples, one per non-overlapping match, in order
        of appearance; ``end`` is exclusive.
    """
    spans = []
    for hit in re.finditer(pattern, string_target, flags=re.IGNORECASE):
        spans.append(hit.span())
    return spans
# Loose matcher for clock times in free text: 1-2 digits, an optional ":", " "
# or "." separator, two optional further digits, an optional a.m./p.m. style
# marker, and an optional trailing qualifier such as "today" or "midnight".
# NOTE(review): "(mid?)night" matches "minight"/"midnight" but not "night";
# "(mid)?night" may have been intended — confirm.
pattern_time = (
r"\b\d{1,2}([: .]?)(\d{2})?(\s?)((P|p)|(A|a)?)(.?)((M|m)?)(.?)((next|this)?)(\s?)((tomorrow|today|day|evening|morning|(mid?)night|((after|before)?)(noon))?)\b"
)
test_string = "The meeting is at 3pm today or 5 A.M. tomorrow or 7 this afternoon or 00:00 midnight. Let's meet at 11.30 p.M. We can also do 8:45 pm or 1200 hr or 00hr."
# Print the matched substrings first, then their (start, end) offsets.
matches_group_list = get_matches_group_list(pattern_time, test_string)
print(matches_group_list)
matches_locations_list = get_matches_locations(pattern_time, test_string)
print(matches_locations_list)
Specific format
# Example 1: "#" or "%" followed by 22 to 30 alphanumeric characters.
pattern_X = r"^[#%][a-zA-Z0-9]{22,30}$"
test_string = "#6FDF18E86000542388064492B58CB" # 30 characters
test_string = "%6FDF18E86000542388064492B58CB" # 30 characters
# Example 2: one segment whose repetition count is kept in a variable.
length_of_1st_segment = '{5,10}'
pattern_X = "".join(["^[#%][a-zA-Z0-9+]", length_of_1st_segment, "$"])
test_string = "#6FDF1+++" # 9 characters
# Example 3: an alphanumeric segment followed by a "+"/"@" segment.
length_of_1st_segment = '{5,10}'
length_of_2nd_segment = '{1,3}'
pattern_X = "".join([
    "^[#%][a-zA-Z0-9]", length_of_1st_segment,
    "[+@]", length_of_2nd_segment,
    "$",
])
test_string = "#6FDF1@@@" # 9 characters
# Example 4: as Example 3, with a trailing digit segment.
length_of_1st_segment = '{5,10}'
length_of_2nd_segment = '{1,3}'
length_of_3rd_segment = '{2,3}'
pattern_X = "".join([
    "^[#%][a-zA-Z0-9]", length_of_1st_segment,
    "[+@]", length_of_2nd_segment,
    "[0-9]", length_of_3rd_segment,
    "$",
])
test_string = "#6FDF1@@@01" # 11 characters
N-gram
# Example 1:
# Whole-word patterns of a fixed length. \w also accepts digits and "_",
# whereas [a-zA-Z] restricts the word to letters.
pattern_3_gram_with_numbers = r"\b\w{3}\b"
pattern_3_gram_without_numbers = r"\b[a-zA-Z]{3}\b"
pattern_4_gram = r"\b\w{4}\b"
pattern_4_gram_without_numbers = r"\b[a-zA-Z]{4}\b"
pattern_3_gram_with_numbers_only =r"\b[0-9]{3}\b" # numbers only with length = 3
# Three consecutive characters that are neither ASCII letters, digits nor spaces.
pattern_3_gram_non_alphanumeric_non_space =r"[^a-zA-Z0-9 ]{3}"
# One character outside the excluded set (letters, the listed punctuation and
# the space), followed by any number of digits.
pattern_arbitrary_gram_with_numbers_only_non_space =r"([^a-zA-Z\"=_<>?./\\'{}|+_\-&^%$#@!~`:;, ][0-9]*)"
test_string = "top regular 111 expressions 123523423 halo <<< y 2g2 k0l 111111 "
# Example 2: List the locations as well
import re
pattern_arbitrary_gram_with_numbers_only_non_space =r"([^a-zA-Z\"=_<>?./\\'{}|+_\-&^%$#@!~`:;, ][0-9]*)"
test_string = "top regular 111 expressions 123523423 halo <<< y 2g2 k0l 111111 "
# Collect the matched text and its (start, end) offsets in a single pass.
matches = []
match_locations = []
for hit in re.finditer(pattern_arbitrary_gram_with_numbers_only_non_space, test_string, flags=re.IGNORECASE):
    matches.append(hit.group())
    match_locations.append(hit.span())
print(matches)
print(match_locations)
Template Detection
Email headers
# example of email header template detection
"""
According to Internet Message Format (RFC 5322), mandatory fields include:
Date: Specifies the date and time the message was sent.
From: Specifies the email address of the sender.
To: Specifies the email address(es) of the intended recipient(s).
Message-ID: Provides a unique identifier for the message.
"""
import re
# Example email message
email_message = """
Delivered-To: john.doe@gmail.com
Received: by 10.107.28.21 with SMTP id p21csp515449iof;
Wed, 21 Apr 2023 09:12:14 -0700 (PDT)
X-Received: by 2002:a1c:4c12:: with SMTP id z18mr15577774wmc.160.1618971834241;
Wed, 21 Apr 2023 09:12:14 -0700 (PDT)
Received: from mail.example.com (mail.example.com. [203.0.113.1])
by mx.google.com with ESMTPS id v13si4868822wru.14.2023.04.21.09.12.14
for <john.doe@gmail.com>
(version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
Wed, 21 Apr 2023 09:12:14 -0700 (PDT)
From: Jane Doe <jane.doe@example.com>
To: John Doe <john.doe@gmail.com>
Subject: Example email
Message-ID: <CAJG9ZdPY7f4Kv4w_Eh1xjR-e8zm5Wzr5r3qvy3ZiajP8PaQW-g@mail.example.com>
Date: Wed, 21 Apr 2023 16:12:13 +0000
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Thunderbird/88.0
MIME-Version: 1.0
Content-Type: multipart/alternative;
boundary="------------030204060:
This is an example email message.
"""
# One anchored pattern per header; group 1 captures the header value.
pattern_to = r'^To: (.+)$'
pattern_from = r'^From: (.+)$'
pattern_subject = r'^Subject: (.+)$'
pattern_message_id = r'^Message-ID: (.+)$'
pattern_date = r'^Date: (.+)$'
# MULTILINE lets "^" and "$" anchor at every line of the message, so each
# header is found on whichever line it occupies.
match_to, match_from, match_subject, match_message_id, match_date = (
    re.search(header_pattern, email_message, flags=re.MULTILINE)
    for header_pattern in (
        pattern_to,
        pattern_from,
        pattern_subject,
        pattern_message_id,
        pattern_date,
    )
)
# Print the value of every header that was found.
for header_label, header_match in (
    ("To:", match_to),
    ("From:", match_from),
    ("Subject:", match_subject),
    ("Message-ID:", match_message_id),
    ("Date:", match_date),
):
    if header_match:
        print(header_label, header_match.group(1))
# The template counts as detected only when all five headers are present.
if all((match_to, match_from, match_subject, match_message_id, match_date)):
    print("[email header template detected]")
In Between Lines
import re
test_strings = """
....
....
[start]
you should
see me
[end]
.....
....
"""
# Escaped literal markers that delimit the text of interest.
start_tag = r'\[start\]'
end_tag = r'\[end\]'
# Lazily capture everything between the two markers.
pattern = start_tag + '(.*?)' + end_tag
# DOTALL lets "." cross line breaks, so spans covering several lines are captured.
matches = re.compile(pattern, re.DOTALL).findall(test_strings)
# Show each captured span without its leading/trailing whitespace.
for match in matches:
    print(match.strip())
"""
you should
see me
"""
_
import re
test_strings = """
....
....
<input required type="hidden" name="
id_tracking_no value = "
random ...
random ....
"
">
.....
....
"""
# Capture everything between the double quotes that follow
# `id_tracking_no value = `. The class [^"]+ already spans line breaks, so
# neither an f-string nor re.DOTALL is needed.
pattern = r'id_tracking_no value = "([^"]+)"'
# re.findall returns the content of the single capture group for each hit.
matches = re.findall(pattern, test_strings)
# Print each captured span without its leading/trailing whitespace.
for match in matches:
    print(match.strip())
""" output:
random ...
random ....
"""
_
More detection on using extract and hash compare: Template Matching.
Just in case … here is a good library utility:
https://ihateregex.io/
Caveats
Some life-wasting bugs are almost beyond mortal eyes:
"""
line1 = "^[a-zA-Z0–9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
line2 = "^[a-zA-Z0–9_.+-]+@[a-zA-Z0–9-]+\.[a-zA-Z0–9-.]+$"
for i in range(0, len(line1)):
if (line1[i] != line2[i]):
print(line1[i] + " is different from " + line2[i] + " at position " + str(i))
"""
Result:
- is different from – at position 26
- is different from – at position 41