Current scoring arithmetic and rounding - PHP Online
Form of PHP Sandbox
*** This page was generated with the meta tag "noindex, nofollow". This happened because you selected this option before saving or the system detected it as spam. This means that this page will never get into the search engines and the search bot will not crawl it. There is nothing to worry about, you can still share it with anyone.
Enter Your PHP code here for testing/debugging in the Online PHP Sandbox. As in the usual PHP files, you can also add HTML, but do not forget to add the tag <?php
in the places where the PHP script should be executed.
Result of PHP execution
Full code of Current scoring arithmetic and rounding.php
- <?php
// Fixture receipts for the sandbox run.
// Each entry holds the OCR text as a JSON-encoded string literal ('text') plus the values the
// extraction is expected to produce (the 'assert_*' keys). The list is indexed 0, 1 implicitly.
$receiptTextsAndAssertions = [
    // Fixture 0: dm receipt that still contains the retailer name and a "Mixa" line item.
    [
        'text' => '"dm\ndm-drogerle markt\nKirchhellener Stra\u00dfe 142\nHIER BIN ICH MENSCH\n46145 Oberhausen\n0208/62189892\n08. 03.2021 18:27 2107/2 239059/1 0583\nAok Seesand Peeling WeiB.Tee\n1,95 1\nAok Seesand Peeling WeiB. Tee\n1.95 1\nMixa Bodylotion Hyaluro fresh\n3,75 1\nWC-Frisch Duo Stein Zitrus NF\n1,15 1\nWC-Frisch Duo Stein Zitrus NF\n1,15 1\nSaptil mit B\u00fcrste 200ml\n1,95 1\nZwischensumme\n11,90\nEinsteckalbum 10x15 36 Bilder\n1,45 1\nWC-Frisch Duo Stein Zitrus OR\n1,15 1\nFisherman Pro Fresh Blueberry\n1,15 2\nAok Seesand Peeling WeiB.Tee\n1,95 1\nZwischensumme\n17,60\ndm-Rabatte auf rabattf\u00e4hige Antikel\nPocketalbum Gratis\n-1,45\n4 X Coupon 15% Gesicht\n-1,43\nSUMME EUR\n14,72\nKARTENZAHLUNG EUR\n-14,72\nMwSt-Satz\nBrutto\nNetto\nMwSt\n1=19,00%\n13,57\n11.40\n2.17\n2=7,00%\n1,15\n1,07\n0,08\nIhre PAYBACK Kartennr XXXXXXXXX5273\nPunktestand vor Einkauf: 184\nDieser Punktestand entspricht: 1,84 EUR\nF\u00fcr diesen Einkauf erhalten Sie von dm\nPAYBACK Punkte auf bunktef\u00e4hige Artikel\nBasis-Punkte\n7\u00b0P\n\u00d6ffnungszeiter auf dm. de\nSteuer-Nr. : 34092/30007\n-K-U-N-D-E-N-B-E-L-E-G\nTerminal ID :\n65052321\nTA-Nr 702094\nBNr 9791\nKartenzahlung\ngirocard\nEUR 14,72\nPAN\nKante 8\ng\u00fcltig bis\n##/##\nEMV-AID\nA0000003591010028001\n0511052321\nVU-Nr.\nGenehmigungs-Nr\n168743\nDatum 08.03.21\n18:27 Uhr\nx** Zahlung erfolgt ***\nBITTE BELEG AUFBEWAHREN\n"',
        // Expected in yyyy/mm/dd form; the source text itself shows it as 08.03.2021 / 08.03.21.
        'assert_date' => '2021/03/08',
        'assert_time' => '18:27',
        'assert_mixa' => true,
        // 'assert_fuzzy_coupons' => false, (disabled expectation)
    ],
    // Fixture 1: per the original note, the "mixa" keywords and the retailer keywords were
    // removed from this text.
    // NOTE(review): 'assert_mixa' is still true here despite that note - confirm the intended value.
    [
        'text' => '"Mein Drogeriemarkt\n26.03.2021 14:30 Bed.Nr.:119 Kasse 02\nKunde:\n********\n70662\nApp\n3600551026565 HYALURO LOTION\n\u20ac3,99 A\n4002448044116 CLEARASIL WASCHGEL\n\u20ac3,79 A\n* 8X 4305615185255 WINSTON K MEN\u00dc JUNIO \u20ac0,19\n\u20ac1,52 B\n* 2X 4305615185279 WINSTON K MEN\u00dc LACHS \u20ac0,19\n\u20ac0,38 B\n2X 4305615185286 WINSTON K MEN\u00dc LAMM/ \u20ac0,19\n\u20ac0,38 B\n* 2X 4305615185330 WINSTON K MENU WELLN \u20ac0,19\n\u20ac0,38 B\n2X 4305615311555 WINSTON K MEN\u00dc MSC\n\u20ac0,19\n\u20ac0,38 B\n4305615416038 GP NUSS CASHEWKERNE\n\u20ac1,99 B\n4305615650845 ENERBIO ROTE B. SAFT\n\u20ac0,89 A\n*\n4305615693019 GP FRUCHTSCHNITTE VI\n\u20ac0,39 B\n* 2X 4305615730363 WINSTON K MEN\u00dc HUHN/ \u20ac0,19\n\u20ac0,38 B\n*\n2X 4305615730370 WINSTON K MEN\u00dc LACHS \u20ac0,19\n\u20ac0,38 B\nZwischensumme\n\u20ac14,85\nIhre Coupon-Ersparnisse heute:\nCoupon GP Riegel\n9823277110491 \u20ac-0,39\n10% AUF ALLES COUPON\n9823219220103 \u20ac-1,45\nMit Coupons gespart:\n\u20ac-1,84\nTotal\n\u20ac13,01\nBar\n\u20ac13,01\nR\u00fcckgeld (Bar)\n\u20ac0,00\nMWST Gruppe\nNetto\nMWST\nTotal\nA MWST A\n19%\n6,57\n1,25\n7,82\nB MWST B\n7%\n4,85\n0,34\n5,19\nMwSt Total\n11,42\n1,59\n13,01\nTSE Transaktion:\n72209\nTSE-Start:\n2021-03-26T13:29:51.000Z\nEnder\n"',
        // Expected in yyyy/mm/dd form; the source text shows it as 26.03.2021.
        'assert_date' => '2021/03/26',
        'assert_time' => '14:30',
        'assert_mixa' => true,
        // 'assert_fuzzy_coupons' => 100, (disabled expectation)
    ],
];
// Score every fixture receipt and print the raw score next to the "rounded" variant.
foreach ($receiptTextsAndAssertions as $fixture) {
    // 'text' is a JSON string literal; decoding it resolves the \n and \uXXXX escapes.
    $decodedText = json_decode($fixture['text']);

    // Split into individual lines; the pattern accepts both "\n" and "\r\n" line endings.
    $receiptLines = preg_split('|\r{0,1}\n|', $decodedText);
    var_dump($receiptLines);

    $textVerifier = new TextVerifier($receiptLines);
    $rawScore = $textVerifier->getValidityScore();

    // Per the original note, mxa.php applies this same expression; it is reproduced
    // unchanged so its output can be inspected here.
    $shortScore = substr(round($rawScore), -4, 2);

    echo "validityScore is: {$rawScore} -- roundedValidityScore is: {$shortScore}" . PHP_EOL;
}
- //----class from here----------------
/**
 * Scores how likely a block of receipt text is a valid receipt for the campaign.
 *
 * The text MUST be passed in already cleaned (i.e. json_decode()'d) and already split into an
 * array of lines. Every rule named in getRules() is evaluated by the method called
 * applyRule_<ruleName>(), which returns a confidence between 0 and 100. getValidityScore()
 * multiplies each confidence by the rule's weighting and sums the results.
 */
class TextVerifier
{
    /** @var string[] the receipt text, one array entry per line */
    private $lines;

    /**
     * @param string[] $lines receipt text, already cleaned and split into lines
     */
    public function __construct(array $lines)
    {
        $this->lines = $lines;
    }

    /**
     * Apply every rule from getRules() to the supplied text and add up the weighted results.
     *
     * NOTE: each rule yields 0-100 and the weightings in getRules() add up to 100, so the value
     * returned here can reach 10000 (100 * 100); it is NOT normalised to 0-100.
     *
     * Side effect: echoes one diagnostic line per rule.
     *
     * @return int|float sum of (rule confidence * rule weighting) over all rules
     */
    public function getValidityScore()
    {
        $validityScores = [];

        foreach ($this->getRules() as $ruleName => $ruleWeighting) {
            // Rule methods are resolved by name: 'retailer' => applyRule_retailer(), etc.
            $ruleResult = $this->{"applyRule_{$ruleName}"}();
            $weightedRuleResult = $ruleWeighting * $ruleResult;
            $validityScores[$ruleName] = $weightedRuleResult;

            echo "Rule [{$ruleName}] gives {$ruleResult} which, with a weighting of {$ruleWeighting}, gives: {$weightedRuleResult} total" . PHP_EOL;
        }

        //TODO: decide whether/how to normalise the aggregate score
        return array_sum($validityScores);
    }

    /**
     * Per-campaign rules for this calculation.
     *
     * @return array<string, int> map of rule name => weighting (the weightings add up to 100)
     */
    public function getRules()
    {
        return [
            'retailer' => 10,
            // written as 5, not 05: a leading zero makes PHP read the literal as octal
            'timeframe' => 5,
            //'matching_items' => 75,
            'contains_mixa' => 75,
            'is_probably_receipt' => 10,
        ];
    }

    //----rule methods----------------
    // Rule method names must equal the keys returned by getRules(), prefixed with 'applyRule_'.
    // Rule methods return between 0 (no confidence) and 100 (full confidence).

    /**
     * Earlier retailer rule: best fuzzy match of any known retailer keyword.
     *
     * The previous version wrote `'dm-drogerie' or 'dm-drogerle' or 'ROSSMAN'`, which only ever
     * assigned the first keyword because `=` binds tighter than `or`. All three keywords are
     * now checked.
     *
     * @return int|float highest fuzzy-match percentage (0-100) over the retailer keywords
     */
    public function applyRule_retailer_orig() //TODO: seems not at all useful for the grand weighting scheme of things
    {
        return $this->findBestFuzzyMatch(['dm-drogerie', 'dm-drogerle', 'ROSSMAN']);
    }

    /**
     * Retailer rule: all-or-nothing check that a retailer keyword fuzzily appears in the text.
     *
     * @return int 100 when the best fuzzy match exceeds the threshold, otherwise 0
     */
    public function applyRule_retailer() //TODO: seems not at all useful for the grand weighting scheme of things
    {
        $highestScore = $this->findBestFuzzyMatch(['dm-drogerie', 'ROSSMAN']);

        //TODO: sort out what we want to do. 85 IS AN EXAMPLE HERE!
        if ($highestScore > 85) {
            return 100;
        }

        return 0;
    }

    /**
     * Earlier timeframe rule: fuzzy-searches the text for every date (formatted dd.mm.yyyy)
     * from 8 March 2021 up to, but not including, 25 April 2021 (DatePeriod excludes the end).
     *
     * The previous version overwrote the confidence on every iteration and therefore only
     * reported the match for the final date; the best match over all dates is returned now.
     *
     * @return int|float highest fuzzy-match percentage (0-100) over the dates in the window
     */
    private function applyRule_timeframe_orig() //TODO: is this algorithm still helpful or relevant???
    {
        $dateRange = new DatePeriod(
            new DateTimeImmutable('2021-03-08'),
            new DateInterval('P1D'),
            new DateTimeImmutable('2021-04-25')
        );

        $candidateDates = [];
        foreach ($dateRange as $eachDate) {
            $candidateDates[] = $eachDate->format('d.m.Y');
        }

        return $this->findBestFuzzyMatch($candidateDates);
    }

    /**
     * Timeframe rule (placeholder): only checks whether ANY date can be extracted.
     *
     * @return int 50 when no date is found, otherwise 100
     */
    private function applyRule_timeframe() //TODO: is this validation check still helpful or relevant???
    {
        $receiptDate = $this->extractDateFromText();
        if ($receiptDate === false) {
            return 50; //or?
        }

        //TODO clever stuff with a DateTime object: load $receiptDate and see if it meets requirements
        return 100;
    }

    /**
     * Checks whether there are any qualifying products on the receipt.
     *
     * Only ONE keyword needs to match, so the best fuzzy match over all keywords is returned.
     * The previous version returned the result for the last keyword only.
     *
     * NOTE: findFuzzyWord() compares against single words of the text, so the multi-word
     * keywords below cannot reach 100 as things stand.
     * TODO: if more than one qualifying product is present, can we *trust* the receipt more?
     *
     * @return int|float highest fuzzy-match percentage (0-100) over the product keywords
     */
    private function applyRule_matching_items()
    {
        $matching_item_keywords = [
            'Mixa',
            'Cica Repair',
            'Panthenol Comfort',
            'bodylotion',
            'lotion',
            'Shea Ultra Soft Body Milk',
            'Hyaluro Fresh Body',
            'Intensiv Straffend',
        ];

        return $this->findBestFuzzyMatch($matching_item_keywords);
    }

    /**
     * Product rule: is the text 'mixa' present anywhere (case-insensitive)?
     *
     * @return int 100 when present, otherwise 0
     */
    public function applyRule_contains_mixa()
    {
        if ($this->findExactWord('mixa')) {
            return 100;
        }

        return 0;
    }

    /**
     * Checks whether the scanned document looks like a receipt at all.
     *
     * 'MwSt' is the German equivalent of VAT; if the text has it, this check passes.
     *
     * @return int|float fuzzy-match percentage (0-100) for the keyword 'MwSt'
     */
    private function applyRule_is_probably_receipt()
    {
        return $this->findFuzzyWord('MwSt');
    }

    //----/end rule methods----------------

    //----text scanning methods----------------

    /**
     * Extract the first time of day found in the text.
     *
     * NOTE: very heavily assumes 24-hour format. Hours must be 00-23 and minutes 00-59, and the
     * value may not be directly preceded or followed by another digit. A time at the very end
     * of the text is matched too (previously a trailing non-digit character was required).
     *
     * @return false|string false on no time match, else the time as 'hh:mm' (24-hour clock)
     */
    public function extractTimeFromText()
    {
        // Glue with '&' rather than a space: a space could be picked up by the date/time
        // patterns and cause matches ACROSS lines.
        $gluedText = implode('&', $this->lines);

        $matchResult = preg_match(
            '~(?<![0-9])((?:[01][0-9]|2[0-3]):[0-5][0-9])(?![0-9])~',
            $gluedText,
            $matches
        );

        if ($matchResult === 1) {
            return $matches[1];
        }

        return false;
    }

    /**
     * Extract the first plausible date found in the text.
     *
     * NOTE: very heavily assumes dd/mm/yy or dd/mm/yyyy in the source text, where the separator
     * may be '-', '.', '/' or a space. Two-digit years are prefixed with '20'.
     *
     * Compared with the earlier pattern:
     *  - a date at the very end of the text is matched (a trailing character is not required);
     *  - the day may not be directly preceded by a digit, so an ISO timestamp such as
     *    2021-03-26T... is no longer misread as dd-mm-yy;
     *  - the year must be exactly 2 or 4 digits;
     *  - candidates that are not real calendar dates are skipped and scanning continues.
     *
     * The result is deliberately yyyy/mm/dd and not dd/mm/yyyy: per the original notes this was
     * kept so as not to worsen data consistency, and the backoffice "receipt date" input label
     * needs to reflect it.
     *
     * @return false|string false on no date match, else the date as 'yyyy/mm/dd'
     */
    public function extractDateFromText()
    {
        // Glue with '&' rather than a space: a space is a valid date separator here and would
        // allow matches ACROSS lines.
        $gluedText = implode('&', $this->lines);

        $pattern = '~(?<![0-9])([0-9]{2})[-./ ]([0-9]{2})[-./ ]([0-9]{4}|[0-9]{2})(?![0-9])~';
        $offset = 0;

        while (preg_match($pattern, $gluedText, $matches, PREG_OFFSET_CAPTURE, $offset) === 1) {
            $d = $matches[1][0];
            $m = $matches[2][0];
            $y = $matches[3][0];

            if (strlen($y) === 2) {
                $y = '20' . $y;
            }

            if (checkdate((int) $m, (int) $d, (int) $y)) {
                return "{$y}/{$m}/{$d}";
            }

            // Not a real date: resume one character after the start of this candidate so that
            // an overlapping genuine date is still found.
            $offset = $matches[0][1] + 1;
        }

        return false;
    }

    /**
     * Find a single keyword fuzzily.
     *
     * Returns the best similar_text() percentage between the keyword and any SINGLE word of
     * the text, compared case-insensitively. For example 'Mixa' matches 'Mixa' at 100 and
     * 'Mixy' at only 75.
     *
     * TODO: need to think about single words versus word phrases (probably)
     * NOTE: performance concerns(?) - every word of every line is compared.
     *
     * @param string $fuzzyWordToFind a SINGLE keyword, not a phrase
     * @return int|float best match percentage, between 0 and 100
     */
    public function findFuzzyWord($fuzzyWordToFind) //TODO: method name could be better
    {
        // similar_text() is case-sensitive, so both sides are lower-cased; the keyword only once.
        $needle = strtolower($fuzzyWordToFind);
        $highestMatch = 0;

        foreach ($this->lines as $line) {
            foreach ($this->getWordsFromPhrase($line) as $word) {
                similar_text($needle, strtolower($word), $percentage);

                if ($percentage > $highestMatch) {
                    $highestMatch = $percentage;
                }

                // a perfect match cannot be beaten, so stop immediately
                if ($highestMatch == 100) {
                    return 100;
                }
            }
        }

        return $highestMatch;
    }

    /**
     * Case-insensitive search for a keyword anywhere in the text.
     *
     * TODO: despite the name this is a SUBSTRING search, so 'mixa' also matches e.g. 'mixay'.
     *
     * @param string $exactWordToFind text to look for
     * @return bool true if the keyword occurs in the text
     */
    public function findExactWord($exactWordToFind)
    {
        // Glue with a character that will not appear in a keyword, so nothing matches across lines.
        $gluedText = implode('§', $this->lines);

        return stripos($gluedText, $exactWordToFind) !== false;
    }

    /**
     * Split a multi-word phrase up into its constituent single words.
     *
     * @param string $phrase text to split
     * @return string[]|false the pieces between runs of whitespace (false if the split fails)
     */
    public function getWordsFromPhrase($phrase)
    {
        return preg_split("/[\s]+/", $phrase); //split on one or more whitespace characters
    }

    /**
     * Best findFuzzyWord() percentage over a list of keywords.
     *
     * @param string[] $keywords keywords to try
     * @return int|float highest match percentage, between 0 and 100 (0 for an empty list)
     */
    private function findBestFuzzyMatch(array $keywords)
    {
        $bestMatch = 0;

        foreach ($keywords as $keyword) {
            $match = $this->findFuzzyWord($keyword);
            if ($match > $bestMatch) {
                $bestMatch = $match;
            }

            if ($bestMatch == 100) {
                return 100;
            }
        }

        return $bestMatch;
    }

    //----/end text scanning methods------------
}