C code samples from SpamCheetah

How is SpamCheetah organized?

The developers of SpamCheetah are old school and even in today’s world the major software engines are all powered by C and UNIX.

And it is important to note that many programming languages like perl and python are coded in C.

Node.js is the only exception as it is written in C++.

Since email is a very powerful thing both socially and technically it needs a lot of raw power and that power can be found only in C.

In this blog we shall explore some aspects of the program that makes it so powerful and useful.

The core proxy loop

Here is a sample of the main proxy loop C code.

 setsockopt(serversock, SOL_SOCKET, SO_REUSEADDR, &on, 1);
  listen(serversock, 1024);

  topfd[0].fd = unixsock; topfd[0].events = POLLIN;

  topfd[1].fd = serversock; topfd[1].events = POLLIN;

POLLBEGIN:

  for (;;) {
    /*
     * The parent is always here, this is where the whole story *
     of this code resides...  */
    tlsmail = 0;

    syslog(LOG_INFO, "Listening for new mail connections");
    syslog(LOG_INFO, "The mail server IP is %s\n", mtaip); /* Doing
    a poll with infinite timeout for read... */ retval = poll(topfd,
    2, -1);


    /* XXX serversock This handles the mail proxying */ if
    (topfd[1].revents & POLLIN) {
      mailacc = accept(serversock, (struct sockaddr *)&clientaddr,
      &l); if (mailacc == -1) {
        syslog(LOG_ERR, "TCP accept failed"); syslog(LOG_ERR, "accept:
        %s", strerror(errno));
      } syslog(LOG_INFO, "New connection from %s:%d",
             inet_ntoa(clientaddr.sin_addr),
             ntohs(clientaddr.sin_port));

      globalcnt.totalmailattemptcnt++; writenext = 0;
      strncpy(envip, inet_ntoa(clientaddr.sin_addr), sizeof(envip));
      strncpy(child_struct.envip, envip, strlen(envip));

      /* XXX Check whether we can receive mail from this IP */

From the above you can see that the sockets are set up for polling and looping. Some flags are set too.

We also have a UNIX domain socket for querying metrics/latency/stats etc.

The UNIX domain socket also follows a fork -> process model.

The benefit of using fork(2), in addition to performance, is that if a child crashes the parent keeps running.

/* Fork(2) stuff */
signal(SIGCHLD, waitforkidandkill);
parentpid = getpid();
pid = fork();
if (pid == -1) {
  syslog(LOG_ERR, "fork: %s", strerror(errno));
  syslog(LOG_ERR, "Fork() failed, very bad");
}
if (pid != 0) { /* Parent process */
  close(mailacc);
  close(connfd);
  signal(SIGUSR1, parent_shm);
  syslog(LOG_INFO, "Waiting for mail processing");
  continue;
  /* XXX Go back to Outside poll(2) */
} else { /* Child process of fork() handle mail
          * proxying in a concurrent way
          */
  connfd = socket(PF_INET, SOCK_STREAM, 0);
  if (connfd == -1) {
    syslog(LOG_ERR, "socket: %s", strerror(errno));
  }

  /* Remote MTA address */
  tmailsrv.sin_addr.s_addr = inet_addr(mtaip);
  tmailsrv.sin_family = AF_INET;
  tmailsrv.sin_port = htons(25);
  l = sizeof(tmailsrv);

  retval = connect(connfd, (struct sockaddr *)&tmailsrv, l);
  if (retval == -1) {
    syslog(LOG_ERR, "connect: %s", strerror(errno));
    syslog(LOG_ERR, "Could not connect to target mail server");
    syslog(LOG_INFO, "Mail Server %s down? Exiting ", mtaip);
    mtadowncnt++;
    syslog(LOG_ERR, "Down for %d counts", mtadowncnt);
    read_shm();
    write_shm(&mtadowncnt, MTADOWN, sizeof(char));
    kill(parentpid, SIGUSR1);
    if (mtadowncnt == 5)
      activate_internal_mta();
    exit(128);
  }
  syslog(LOG_INFO, "Connected to target mail server\n");

  pfd[0].fd = mailacc;
  pfd[0].events = POLLIN;

  pfd[1].fd = connfd;
  pfd[1].events = POLLIN;

  for (;;) {
    /* Doing a poll with infinite timeout for read... */
    retval = poll(pfd, 2, -1);

    if (retval == -1 || retval == 0) {
      syslog(LOG_ERR, "poll: %s", strerror(errno));
      syslog(LOG_ERR, "Inside poll failure: Exiting child");
      /*  Only exit for Child XXX */
      exit(0);
    }

    /* XXX connfd from MTA */
    if (pfd[1].revents & POLLIN) {
      r = read(connfd, buf, sizeof(buf));
      buf[r] = 0;
      if (r == 0 || r == -1) {
        syslog(LOG_INFO, "Target Mail Server sock closed on read(2)");
        shutdown(connfd, SHUT_WR);
        close(connfd);
        shutdown(mailacc, SHUT_WR);
        close(mailacc);

        pfd[1].fd = -1;
        pfd[1].events = 0;
        if (tlsmail) {
          ; // insert_maildb();
        }
        unlink(tmpmailname);
      }
      p = strdup(buf);
      while ((sep = strsep(&p, "\r\n"))) {
        char *t;
        slashr = strchr(sep, '\r');
        if (slashr)
          *slashr = 0;
        if (mailstate == MAIL_SENT && (t = strstr(sep, "250"))) {
          strncpy(mailqid, t + 10, sizeof(mailqid));
          syslog(LOG_INFO, "---MAIL SENT SUCCESSFULLY---");
          extract_header_fields();
          insert_maildb();
          kill(parentpid, SIGUSR1);
        }
      }
      free(p);

The above code does further processing of the sockets. Once a connection arrives at our desk we must set up some variables and structures and connect to the target mail server to start SMTP proxying.

The ability to quickly send traffic back and forth is vital.

See below.

 /* XXX mailacc Email client */
 w = write(mailacc, buf, r);
 p = strdup(buf);
 while ((sep = strsep(&p, "\r\n"))) {
   slashr = strchr(sep, '\r');
   if (slashr)
     *slashr = 0;
   if (isalnum(sep[0]) && !tlsmail)
     syslog(LOG_INFO, "<-- : %s", sep);
 }
 free(p);

 syslog(LOG_INFO, "Wrote %d bytes to client", w);
 if (w == 0 || w == -1) {
   syslog(LOG_INFO, "Mail Client sock closed on write(2)");
   shutdown(mailacc, SHUT_WR);
   close(mailacc);
   shutdown(connfd, SHUT_WR);
   close(connfd);
   pfd[0].fd = -1;
   pfd[0].events = 0;
   if (tlsmail) {
     // insert_maildb(mailsz);
   }
   exit(0);
 }


 /* XXX mailacc MTA client */
 if (pfd[0].revents & POLLIN) {
 r = read(mailacc, buf, sizeof(buf));
 buf[r] = 0;
 if (r == 0 || r == -1) {
   /* XXX MAIL LOOP END */
   syslog(LOG_INFO, 
     "Mail Client sock closed mail process KILLED");
   shutdown(mailacc, SHUT_WR);
   close(mailacc);
   shutdown(connfd, SHUT_WR);
   close(connfd);
   pfd[0].fd = -1;
   pfd[0].events = 0;
   unlink(tmpmailname);
   exit(0);
 }
 syslog(LOG_INFO, "Read %d bytes from Mail client\n", r);
 if (writenext == 0) {
   p = strdup(buf);
   while ((sep = strsep(&p, "\r\n"))) {
     slashr = strchr(sep, '\r');
     if (slashr)
       *slashr = 0;
     if (isalnum(sep[0]) && !tlsmail)
       syslog(LOG_INFO, "--> : %s", sep);
   }
   free(p);
 } else {
   /* The main mail analysis happens here XXX */
   dropmail = 0, insertqua = 0, subfix = 0, badmail = 0;
   /* XXX MAIL LOOP START */
   ret = write_mail_to_disk(connfd, mailacc, buf, r);
   if (ret == -1) {
     syslog(LOG_INFO, "I am now going to DROPMAIL");
     goto DROPMAIL;
   }
   if (ret == 0)
     writenext = 0;
   continue;
 }

 /* This part executes before we start writing mail body */
 if ((p1 = strcasestr(buf, "ehlo")) ||
     (p1 = strcasestr(buf, "helo"))) {
   mark = strchr(p1, ' ');
   upto = strchr(p1, '\r');
   if (upto == NULL)
     syslog(LOG_ERR, "NULL pointer");
   if (upto)
     strncpy(helostring, mark + 1, upto - mark - 1);
   syslog(LOG_INFO, "HELO string is [%s]\n", helostring);
   ret = fqdncheck(helostring);
   if (ret == FQDNREJ) {
     w = write(mailacc, 
     "551 Sorry we accept only FQDN in HELO/EHLO",
               16);
     shutdown(connfd, SHUT_WR);
     close(connfd);
     shutdown(mailacc, SHUT_WR);
     close(mailacc);
     pfd[0].fd = -1;
     pfd[0].events = 0;
     unlink(tmpmailname);
     exit(128);
   }

   ret = helocheck(helostring);
   if (ret == HELOREJ) {
     w = write(mailacc, "551 Invalid HELO", 16);
     shutdown(connfd, SHUT_WR);
     close(connfd);
     shutdown(mailacc, SHUT_WR);
     close(mailacc);
     pfd[0].fd = -1;
     pfd[0].events = 0;
     unlink(tmpmailname);
     exit(128);
   }
 }

 if ((p1 = strcasestr(buf, "starttls"))) {
   tlsmail = 1;
   syslog(LOG_INFO, 
     "[ENCRYPTED] so not logging SMTP handshake");
 }
 if ((p1 = strcasestr(buf, "mail from"))) {
   syslog(LOG_INFO, "Checking for ENV mail from");
   mark = strchr(p1, '<');
   upto = strchr(p1, '>');
   if (upto == NULL)
     syslog(LOG_ERR, "NULL pointer");
   if (upto) {
     strncpy(envfrom, mark + 1, upto - mark);
     envfrom[upto - mark - 1] = 0;
   }
   if (envfrom[0] == 0) {
     syslog(LOG_INFO, "NULL ENVFROM exiting");
     shutdown(connfd, SHUT_WR);
     close(connfd);
     shutdown(mailacc, SHUT_WR);
     close(mailacc);
     pfd[0].fd = -1;
     pfd[0].events = 0;
     exit(128);
   }
   syslog(LOG_INFO, "ENVFROM is [%s]\n", envfrom);

   ret = sendercheck(envfrom);
   if (ret != 0)
     goto DROPMAIL;

   ret = rbl_check_ip();
   if (ret != 0)
     goto DROPMAIL;

   ret = revdns_check();
   if (ret != 0)
     goto DROPMAIL;

   ret = dnschecks();
   if (ret != 0)
     goto DROPMAIL;

   syslog(LOG_INFO, "Checking for RFC 5321 compatibility");
   ret = rfccheck(envfrom);
   if (ret != 0)
     goto DROPMAIL;
 }

 if ((p1 = strcasestr(buf, "rcpt to"))) {
   syslog(LOG_INFO, "Checking for ENV rcpt to");
   mark = strchr(p1, '<');
   upto = strchr(p1, '>');
   if (upto) {
     strncpy(envto, mark + 1, upto - mark);
     envto[upto - mark - 1] = 0;
   }
   syslog(LOG_INFO, "ENVTO is [%s]\n", envto);
   strncpy(child_struct.mailid, envto, strlen(envto));
   strncpy(mail_reports.toid, envto, strlen(envto));
   ret = recipcheck(envto);
   if (ret != 0)
     goto DROPMAIL;
   ret = relaycheck(envto);
   if (ret != 0)
     goto DROPMAIL;
 }
 if (strcasestr(buf, "data\r\n")) {
   mailstate = IN_PROCESS;
   writenext = 1;
   syslog(LOG_INFO, "Mail body found...");
 }

 w = write(connfd, buf, r);
 syslog(LOG_INFO, "Wrote %d bytes to MTA", w);

 if (w == 0 || w == -1) {
   syslog(LOG_INFO, 
     "Outside: Mail Server sock closed on write(2)");
   shutdown(connfd, SHUT_WR);
   close(connfd);
   shutdown(mailacc, SHUT_WR);
   close(mailacc);
   pfd[0].fd = -1;
   pfd[0].events = 0;
   unlink(tmpmailname);
   exit(0);
 }
 /* if(pfd[0].revents & POLLIN)  */
continue;

The above part is quite simple to follow. The heart of the mail proxying happens here wherein we process mail and check for SMTP verbs and so on.

The core spam processing is not shown, that happens in another really long function with calls to various malware, virus and spam check APIs.

The usage of poll(2) for socket multiplexing is the key thing here and obviously we use multi forking model which works really well with concurrent mail invocation.

The way the above code does its job can be described in some simple sentences. The details are in the code of course, but here is a broad overview of what really is going on in the code.

  • We start a poll(2) loop to listen at TCP socket accept(2) call
  • Once a socket connection is established we serve it
  • Serving a connection means proxying traffic to the mail server
  • The processing of the mail header and payloads happen in between
  • The mail handshake between client and server goes through states
  • The DATA state is most significant since till then mail is not actually considered sent by client
  • If data phase ends and mail server sends a Queue ID then mail is considered delivered by mail client

Here is a sample of the DNS checks subsystems.

DNS callbacks

/*
 * Copy the next `len` bytes of the packet into `data` and advance the
 * read cursor by `len`.
 *
 * Returns 0 on success.  Returns -1 without copying when an error is
 * already recorded in p->err, or when fewer than `len` bytes are left,
 * in which case p->err is set to "too short".
 */
int unpack_data(struct unpack *p, void *data, size_t len) {
  size_t remaining;

  if (p->err)
    return (-1);

  remaining = p->len - p->offset;
  if (remaining < len) {
    p->err = "too short";
    return (-1);
  }

  memmove(data, p->buf + p->offset, len);
  p->offset += len;
  return (0);
}

/*
 * Read a 16-bit value from the packet and convert it from network to
 * host byte order.  Returns 0 on success, -1 if the bytes could not be
 * read (in which case *u16 is not converted).
 */
int unpack_u16(struct unpack *p, uint16_t *u16) {
  int rc = unpack_data(p, u16, sizeof(uint16_t));

  if (rc != -1) {
    *u16 = ntohs(*u16);
    rc = 0;
  }
  return (rc);
}

/*
 * Read a 32-bit value from the packet and convert it from network to
 * host byte order.  Returns 0 on success, -1 if the bytes could not be
 * read (in which case *u32 is not converted).
 */
int unpack_u32(struct unpack *p, uint32_t *u32) {
  int rc = unpack_data(p, u32, sizeof(uint32_t));

  if (rc != -1) {
    *u32 = ntohl(*u32);
    rc = 0;
  }
  return (rc);
}

/*
 * Copy a 4-byte IPv4 address out of the packet into *a.  The bytes are
 * copied as-is; no byte-order conversion is applied.
 */
int unpack_inaddr(struct unpack *p, struct in_addr *a) {
  const size_t ipv4_len = 4;

  return unpack_data(p, a, ipv4_len);
}

/*
 * Copy a 16-byte IPv6 address out of the packet into *a6.  The bytes are
 * copied as-is; no byte-order conversion is applied.
 */
int unpack_in6addr(struct unpack *p, struct in6_addr *a6) {
  const size_t ipv6_len = 16;

  return unpack_data(p, a6, ipv6_len);
}

/*
 * Expand the domain name at the current packet offset into `dst`
 * (at most `max` bytes) and move the offset past it.
 *
 * Returns 0 on success.  Returns -1 if an error was already recorded,
 * if dname_expand() rejects the name ("bad domain name"), or if the
 * expanded length falls outside 0..MAXDNAME ("domain name too long").
 */
int unpack_dname(struct unpack *p, char *dst, size_t max) {
  ssize_t expanded;

  if (p->err)
    return (-1);

  expanded = dname_expand(p->buf, p->len, p->offset, &p->offset, dst, max);
  if (expanded >= 0 && expanded <= MAXDNAME)
    return (0);

  /* -1 is dname_expand()'s failure value; anything else is a size problem. */
  p->err = (expanded == -1) ? "bad domain name" : "domain name too long";
  return (-1);
}

/*
 * Read the fixed-size DNS header (HFIXEDSZ bytes) into *h and convert the
 * flags and the four section counters to host byte order.
 * Returns 0 on success, -1 if the header could not be read.
 */
int unpack_header(struct unpack *p, struct dns_header *h) {
  if (unpack_data(p, h, HFIXEDSZ) != -1) {
    h->flags = ntohs(h->flags);
    h->qdcount = ntohs(h->qdcount);
    h->ancount = ntohs(h->ancount);
    h->nscount = ntohs(h->nscount);
    h->arcount = ntohs(h->arcount);
    return (0);
  }

  return (-1);
}

/*
 * Read one question entry: name, type and class.
 *
 * The individual return values are not inspected here because every
 * unpack helper refuses to run once p->err is set; the recorded error is
 * checked once at the end.  Returns 0 on success, -1 on any failure.
 */
int unpack_query(struct unpack *p, struct dns_query *q) {
  (void)unpack_dname(p, q->q_dname, sizeof(q->q_dname));
  (void)unpack_u16(p, &q->q_type);
  (void)unpack_u16(p, &q->q_class);

  if (p->err)
    return (-1);
  return (0);
}

/*
 * Read one resource record from the packet into *rr.
 *
 * The common prefix (owner name, type, class, TTL, rdata length) is read
 * first, then the rdata is decoded according to rr->rr_type.  Types that
 * are not handled explicitly are not decoded: rr->rr.other.rdata is left
 * pointing into the packet buffer and rr->rr.other.rdlen holds its length.
 *
 * Returns 0 on success, -1 on failure with the reason stored in p->err.
 */
int unpack_rr(struct unpack *p, struct dns_rr *rr) {
  uint16_t rdlen;
  size_t save_offset;

  /* Errors latch in p->err, so the results are checked once below. */
  unpack_dname(p, rr->rr_dname, sizeof(rr->rr_dname));
  unpack_u16(p, &rr->rr_type);
  unpack_u16(p, &rr->rr_class);
  unpack_u32(p, &rr->rr_ttl);
  unpack_u16(p, &rdlen);

  if (p->err)
    return (-1);

  /* The announced rdata must fit in what is left of the packet. */
  if (p->len - p->offset < rdlen) {
    p->err = "too short";
    return (-1);
  }

  /* Remembered so the number of rdata bytes consumed can be verified. */
  save_offset = p->offset;

  switch (rr->rr_type) {

  case T_CNAME:
    unpack_dname(p, rr->rr.cname.cname, sizeof(rr->rr.cname.cname));
    break;

  case T_MX:
    unpack_u16(p, &rr->rr.mx.preference);
    unpack_dname(p, rr->rr.mx.exchange, sizeof(rr->rr.mx.exchange));
    break;

  case T_NS:
    unpack_dname(p, rr->rr.ns.nsname, sizeof(rr->rr.ns.nsname));
    break;

  case T_SOA:
    unpack_dname(p, rr->rr.soa.mname, sizeof(rr->rr.soa.mname));
    unpack_dname(p, rr->rr.soa.rname, sizeof(rr->rr.soa.rname));
    unpack_u32(p, &rr->rr.soa.serial);
    unpack_u32(p, &rr->rr.soa.refresh);
    unpack_u32(p, &rr->rr.soa.retry);
    unpack_u32(p, &rr->rr.soa.expire);
    unpack_u32(p, &rr->rr.soa.minimum);
    break;

  case T_A:
    /* A records of a class other than IN are treated as opaque data. */
    if (rr->rr_class != C_IN)
      goto other;
    unpack_inaddr(p, &rr->rr.in_a.addr);
    break;

  case T_AAAA:
    /* AAAA records of a class other than IN are treated as opaque data. */
    if (rr->rr_class != C_IN)
      goto other;
    unpack_in6addr(p, &rr->rr.in_aaaa.addr6);
    break;
  default:
  other:
    /* Not decoded: expose the raw rdata in place and skip over it. */
    rr->rr.other.rdata = p->buf + p->offset;
    rr->rr.other.rdlen = rdlen;
    p->offset += rdlen;
  }

  if (p->err)
    return (-1);

  /* The decoder must have consumed exactly the announced rdata length. */
  if (p->offset - save_offset != rdlen)
    p->err = "bad dlen";

  return (p->err) ? (-1) : (0);
}

/*
 * Convert a domain name in uncompressed wire format (a sequence of
 * length-prefixed labels ended by a zero-length label) into dotted text.
 *
 * _dname: wire-format name.
 * buf:    destination buffer.
 * max:    size of buf in bytes.
 *
 * Every label is followed by a '.', so "\003www\003com" becomes "www.com."
 * and the root name becomes ".".  Output that does not fit is truncated;
 * the result is always NUL-terminated when max > 0.  When max is 0 nothing
 * is written (the original computed max - 1 and wrote past the buffer).
 *
 * Returns buf.
 */
char *print_dname(const char *_dname, char *buf, size_t max) {
  const unsigned char *dname = (const unsigned char *)_dname;
  char *res = buf;
  size_t left, count;

  if (max == 0)
    return (res);

  if (dname[0] == 0) {
    /* Root name: "." if there is room for it, otherwise an empty string. */
    if (max > 1)
      *buf++ = '.';
    *buf = 0;
    return (res);
  }

  /* Keep one byte back for the terminating NUL. */
  left = max - 1;
  while (dname[0] && left) {
    /* Copy as much of the label as fits while leaving room for its dot. */
    count = (dname[0] < (left - 1)) ? dname[0] : (left - 1);
    memmove(buf, dname + 1, count);
    dname += dname[0] + 1;
    left -= count;
    buf += count;
    if (left) {
      left -= 1;
      *buf++ = '.';
    }
  }
  buf[0] = 0;

  return (res);
}

/*
 * Expand a possibly compressed domain name found in a DNS packet.
 *
 * data/len:  the whole packet.
 * offset:    where the name starts inside the packet.
 * newoffset: if non-NULL, receives the offset of the first byte after the
 *            name as it appears at `offset` (compression pointers followed
 *            along the way do not move it further).
 * dst/max:   if dst is non-NULL and max is non-zero, receives the name in
 *            uncompressed wire format (length-prefixed labels).  Output is
 *            truncated to `max` bytes; the terminating zero is only
 *            written if room is left for it.
 *
 * Returns the length of the uncompressed name including the terminating
 * zero-length label, or -1 if the name is malformed or runs outside the
 * packet.
 */
ssize_t dname_expand(const unsigned char *data, size_t len, size_t offset,
                     size_t *newoffset, char *dst, size_t max) {
  size_t n, count, end, ptr, start;
  ssize_t res;

  if (offset >= len)
    return (-1);

  res = 0;
  end = start = offset;

  for (; (n = data[offset]);) {
    if ((n & 0xc0) == 0xc0) {
      /* Compression pointer: two bytes holding a 14-bit packet offset. */
      if (offset + 2 > len)
        return (-1);
      ptr = 256 * (n & ~0xc0) + data[offset + 1];
      /* Only strictly backward jumps are accepted, which rules out loops. */
      if (ptr >= start)
        return (-1);
      if (end < offset + 2)
        end = offset + 2;
      offset = start = ptr;
      continue;
    }
    /*
     * The label and the length byte that follows it must both lie inside
     * the packet.  The original test used '>' and therefore went on to read
     * data[len] when a label ended exactly at the end of the buffer.
     */
    if (offset + n + 1 >= len)
      return (-1);
    if (dst != NULL && max != 0) {
      count = (max < n + 1) ? (max) : (n + 1);
      memmove(dst, data + offset, count);
      dst += count;
      max -= count;
    }
    res += n + 1;
    offset += n + 1;
    if (end < offset)
      end = offset;
  }
  /* Account for the terminating zero-length label. */
  if (end < offset + 1)
    end = offset + 1;

  if (dst != NULL && max != 0)
    dst[0] = 0;
  if (newoffset)
    *newoffset = end;
  return (res + 1);
}

These are the utility functions for DNS: various callbacks and helper functions. The switch case is the most interesting part, since SPF records can be recursive and hence could take a long time to resolve. And we must do it at wire speed, so we just can’t wait. We must scramble for speed and do whatever we can to gain it.

That includes running our own DNS resolvers like unbound or knot.

/*
 * Start an asynchronous DNS query of the given type for `record` in class
 * C_IN.  The answer is handed to dispatch_record(), which receives `cb` as
 * its argument.  If the query cannot be created the process exits through
 * err(1, ...).
 */
void lookup_record(int type, const char *record, void (*cb)(struct dns_rr *)) {
  struct asr_query *query = res_query_async(record, C_IN, type, NULL);

  if (!query)
    err(1, "res_query_async");

  event_asr_run(query, dispatch_record, cb);
}

/*
 * Completion handler for queries started by lookup_record().
 *
 * ar:  result of the asynchronous query.
 * arg: the per-record callback, passed through as a void pointer.
 *
 * Parses the header and question of the response and calls the callback
 * once per record of the answer section.  Results carrying a resolver
 * error other than NO_DATA are ignored.  Parsing stops at the first
 * malformed element: the original ignored the unpack return values, so a
 * short or corrupt response left `h.ancount` uninitialised and handed
 * undefined records to the callback.
 *
 * NOTE(review): nothing here releases ar->ar_data; confirm who owns that
 * buffer according to the resolver library in use.
 */
void dispatch_record(struct asr_result *ar, void *arg) {
  void (*cb)(struct dns_rr *) = arg;
  struct unpack pack;
  struct dns_header h;
  struct dns_query q;
  struct dns_rr rr;

  if (ar->ar_h_errno && ar->ar_h_errno != NO_DATA) {
    return;
  }
  unpack_init(&pack, ar->ar_data, ar->ar_datalen);
  if (unpack_header(&pack, &h) == -1)
    return;
  if (unpack_query(&pack, &q) == -1)
    return;

  for (; h.ancount; h.ancount--) {
    /* Once one record fails to parse, the rest cannot be trusted either. */
    if (unpack_rr(&pack, &rr) == -1)
      break;
    cb(&rr);
  }
}

/*
 * Add an address or CIDR range string to the front of the global SPF list.
 *
 * buf: NUL-terminated text; it is copied (and truncated if necessary) into
 *      the new entry, so the caller keeps ownership of buf.
 *
 * Returns 0 on success, -1 if buf is NULL or memory cannot be allocated.
 * The original dereferenced an unchecked malloc() result and used
 * strncpy(), which can leave iprange without a terminating NUL before it
 * is logged with %s.
 */
int append_list(char *buf) {
  struct spflist *spf;

  if (buf == NULL)
    return (-1);

  spf = calloc(1, sizeof(*spf));
  if (spf == NULL) {
    syslog(LOG_ERR, "Out of memory, SPF entry %s dropped", buf);
    return (-1);
  }

  strlcpy(spf->iprange, buf, sizeof(spf->iprange));
  syslog(LOG_INFO, "Inserting into spf list  %s", spf->iprange);
  SLIST_INSERT_HEAD(&spflist_head, spf, next);
  return (0);
}

/*
 * Callback for TXT answers: extract SPF data from one TXT record.
 *
 * The rdata is rendered with print_dname(), the trailing separator is
 * stripped, and records that do not start with "v=spf1 " are ignored.
 * Each space-separated term is then handled as follows:
 *   ip4: / +ip4:   added to the SPF list when ip_v4 or ip_both is 1
 *   ip6: / +ip6:   added to the SPF list when ip_v6 or ip_both is 1
 *   include: / redirect=   a further TXT lookup handled by this function
 *   mx / +mx       MX lookup on the record's owner name
 *   a / +a         A and AAAA lookups on the record's owner name
 * All other terms are skipped.
 *
 * Fixes over the original: terms are passed to append_list() directly
 * instead of being copied back into `buf`, which is the very buffer
 * strsep() is still walking (an overlapping copy that corrupts long
 * terms); the trailing-character stripping no longer indexes buf[-1] when
 * the text is empty; the unused 512-entry pointer array is gone.
 *
 * NOTE(review): include:/redirect= chains are followed without any depth
 * or lookup limit; consider enforcing the limit of RFC 7208 section 4.6.4.
 */
void dispatch_txt(struct dns_rr *rr) {
  char buf[512];
  char buf2[512];
  char *in = buf;
  char *term;
  size_t n;

  print_dname(rr->rr.other.rdata, buf, sizeof(buf));
  n = strlen(buf);
  if (n > 0)
    buf[--n] = '\0';
  if (n > 0 && buf[n - 1] == '.')
    buf[--n] = '\0';
  if (strncasecmp("v=spf1 ", buf, 7))
    return;

  while ((term = strsep(&in, " ")) != NULL) {
    if (strcasecmp(term, "v=spf1") == 0)
      continue;

    if (strncasecmp("ip4:", term, 4) == 0) {
      if (ip_v4 == 1 || ip_both == 1) {
        syslog(LOG_INFO, "IPV4 %s", term + 4);
        append_list(term + 4);
      }
      continue;
    }
    if (strncasecmp("ip6:", term, 4) == 0) {
      if (ip_v6 == 1 || ip_both == 1) {
        syslog(LOG_INFO, "IPV6 %s", term + 4);
        append_list(term + 4);
      }
      continue;
    }
    if (strncasecmp("+ip4:", term, 5) == 0) {
      if (ip_v4 == 1 || ip_both == 1) {
        syslog(LOG_INFO, "+IPV4 %s", term + 5);
        append_list(term + 5);
      }
      continue;
    }
    if (strncasecmp("+ip6:", term, 5) == 0) {
      if (ip_v6 == 1 || ip_both == 1) {
        syslog(LOG_INFO, "+IPV6 %s", term + 5);
        append_list(term + 5);
      }
      continue;
    }
    if (strncasecmp("include:", term, 8) == 0) {
      lookup_record(T_TXT, term + 8, dispatch_txt);
      continue;
    }
    if (strncasecmp("redirect=", term, 9) == 0) {
      lookup_record(T_TXT, term + 9, dispatch_txt);
      continue;
    }
    if (strcasecmp(term, "mx") == 0 || strcasecmp(term, "+mx") == 0) {
      print_dname(rr->rr_dname, buf2, sizeof(buf2));
      n = strlen(buf2);
      if (n > 0)
        buf2[n - 1] = '\0'; /* drop the trailing dot */
      lookup_record(T_MX, buf2, dispatch_mx);
      continue;
    }
    if (strcasecmp(term, "a") == 0 || strcasecmp(term, "+a") == 0) {
      print_dname(rr->rr_dname, buf2, sizeof(buf2));
      n = strlen(buf2);
      if (n > 0)
        buf2[n - 1] = '\0'; /* drop the trailing dot */
      lookup_record(T_A, buf2, dispatch_a);
      lookup_record(T_AAAA, buf2, dispatch_aaaa);
      continue;
    }
  }
}

/*
 * Callback for MX answers reached through an SPF "mx" term: resolve the
 * exchange host to its A and AAAA addresses.
 *
 * The exchange name is rendered as text and its trailing dot(s) removed.
 * A null MX (exchange ".") leaves an empty name; the original then read
 * buf[-1] while looking for a second dot.  Empty names are now skipped
 * because there is no host to resolve.
 */
void dispatch_mx(struct dns_rr *rr) {
  char buf[512];
  size_t n;

  print_dname(rr->rr.mx.exchange, buf, sizeof(buf));
  n = strlen(buf);
  if (n > 0)
    buf[--n] = '\0';
  if (n > 0 && buf[n - 1] == '.')
    buf[--n] = '\0';
  if (n == 0)
    return;

  lookup_record(T_A, buf, dispatch_a);
  lookup_record(T_AAAA, buf, dispatch_aaaa);
}

/*
 * Callback for A answers: render the IPv4 address as text and add it to
 * the SPF list.  Records whose address cannot be rendered are ignored.
 *
 * inet_ntop() returns its destination buffer, so the original
 * strlcpy(buffer, ptr, ...) copied the buffer onto itself; that redundant,
 * overlapping copy has been removed.
 */
void dispatch_a(struct dns_rr *rr) {
  char buffer[INET_ADDRSTRLEN];

  if (inet_ntop(AF_INET, &rr->rr.in_a.addr, buffer, sizeof(buffer)) != NULL)
    append_list(buffer);
}

The above should be clear: whenever we get a TXT, IP address (A/AAAA) or MX DNS response, the matching callback is invoked. Usually the functions are small since we don’t have much to do in these places. They normally take microseconds to run.


/*
 * Callback for the sender-domain MX lookup.
 *
 * Stores the exchange host name, without trailing dot(s), in the global
 * `mx`, records a verdict line in mail_reports.generalinfo.mxverdict and
 * sets the global `mxexists` to 0 when a host name is present or to -1
 * when it is empty.
 *
 * A null MX (exchange ".") becomes an empty string after the first strip;
 * the original then evaluated mx[strlen(mx) - 1], i.e. mx[-1].  The length
 * is now tracked explicitly so that case simply takes the "absent" branch.
 */
void dispatch_mxlookup(struct dns_rr *rr) {
  size_t n;

  print_dname(rr->rr.mx.exchange, mx, sizeof(mx));
  n = strlen(mx);
  if (n > 0)
    mx[--n] = '\0';
  if (n > 0 && mx[n - 1] == '.')
    mx[--n] = '\0';

  if (n > 0) {
    syslog(LOG_INFO, "MX of sender EXISTS::  %s", mx);
    snprintf(mail_reports.generalinfo.mxverdict,
             sizeof(mail_reports.generalinfo.mxverdict),
             "MX of IP %s is %s\n", envip, mx);
    mxexists = 0;
  } else {
    snprintf(mail_reports.generalinfo.mxverdict,
             sizeof(mail_reports.generalinfo.mxverdict),
             "Drop mail as MX is absent\n");
    syslog(LOG_INFO, "Drop mail as MX is absent");
    mxexists = -1;
  }
}

/*
 * Callback for the sender-score A lookup.  The returned IPv4 address is
 * rendered as dotted text and the number after its last '.' is stored in
 * the global `senderScore` (0 if the text contains no '.').  Nothing is
 * stored when the address cannot be rendered.
 */
void dispatch_score(struct dns_rr *rr) {
  char text[512];
  const char *dotted;
  char *last_dot;
  int score = 0;

  dotted = inet_ntop(AF_INET, &rr->rr.in_a.addr, text, sizeof text);
  if (dotted == NULL)
    return;

  last_dot = strrchr(dotted, '.');
  if (last_dot != NULL)
    score = strtol(last_dot + 1, NULL, 10);

  senderScore = score;
}

/*
 * Callback for AAAA answers: render the IPv6 address as text and add it to
 * the SPF list.  Records whose address cannot be rendered are ignored.
 *
 * inet_ntop() returns its destination buffer, so the original
 * strlcpy(buffer, ptr, ...) copied the buffer onto itself; that redundant,
 * overlapping copy has been removed.
 */
void dispatch_aaaa(struct dns_rr *rr) {
  char buffer[INET6_ADDRSTRLEN];

  if (inet_ntop(AF_INET6, &rr->rr.in_aaaa.addr6, buffer, sizeof(buffer)) !=
      NULL)
    append_list(buffer);
}

/*
 * Test whether an IPv4 address lies inside a network.
 *
 * addr: address to test (network byte order).
 * net:  network address (network byte order).
 * bits: prefix length.
 *
 * Returns 1 if the first `bits` bits of addr and net are equal, 0 if they
 * differ, and -1 when bits is 0 (kept from the original; callers that only
 * test for non-zero therefore treat a /0 as a match).  Prefix lengths
 * outside 0..32 are rejected with 0; the original shifted by 32 - bits
 * without a range check, which is undefined for such values.
 */
int is_in_network_v4(struct in_addr *addr,
	struct in_addr *net, char bits) {
  uint32_t mask;

  if (bits == 0) {
    return -1;
  }
  if (bits < 0 || bits > 32) {
    return 0;
  }
  mask = htonl(UINT32_C(0xFFFFFFFF) << (32 - bits));
  return ((addr->s_addr ^ net->s_addr) & mask) == 0;
}

/*
 * Test whether an IPv6 address lies inside a network.
 *
 * network: network address.
 * mask:    netmask as an address (set bits mark the network part).
 * ip:      address to test.
 *
 * Returns 1 if ip and network agree on every bit selected by mask,
 * otherwise 0.  The comparison walks the s6_addr bytes; the original read
 * the addresses through int pointers, which violates C's aliasing rules.
 */
int is_in_network_v6(const struct in6_addr *network,
       const struct in6_addr *mask, const struct in6_addr *ip) {
  size_t i;

  for (i = 0; i < sizeof(ip->s6_addr); i++) {
    unsigned char m = mask->s6_addr[i];

    if ((ip->s6_addr[i] & m) != (network->s6_addr[i] & m))
      return 0;
  }
  return 1;
}

/*
 * Check whether the connecting IP is covered by one SPF list entry.
 *
 * needle:   textual IP address of the sender; treated as IPv6 when it
 *           contains a ':', otherwise as IPv4.
 * haystack: SPF entry, either "address" or "address/prefix-length".
 *           Without a prefix length the full address must match.
 *
 * Returns 0 when the address is inside the range, -1 when it is not or
 * when either string cannot be parsed for the needle's address family.
 *
 * Fixes over the original:
 *  - the IPv6 branch dereferenced NULL when the entry had no '/', copied
 *    the '/' into the network string and parsed the prefix length as if it
 *    were a netmask address, so it never compared valid data;
 *  - address parsing results were unchecked, so failures compared
 *    uninitialised addresses;
 *  - the prefix length is validated (a non-numeric prefix used to count as
 *    /0 and match everything) and the network string copy is bounded;
 *  - the stray printf() debugging output is gone.
 * An explicit "/0" on an IPv4 entry still matches every address, as before.
 */
int check_valid_spf(char *needle, char *haystack) {
  char nw[INET6_ADDRSTRLEN];
  const char *slash;
  char *end;
  size_t addrlen, i;
  long prefix;
  int is_v6, maxbits;

  if (needle == NULL || haystack == NULL)
    return (-1);

  is_v6 = (strchr(needle, ':') != NULL);
  maxbits = is_v6 ? 128 : 32;
  if (is_v6)
    syslog(LOG_INFO, "IPV6 match spf");
  else
    syslog(LOG_INFO, "IPV4 match spf");

  /* Split "address/prefix" into its two parts. */
  slash = strchr(haystack, '/');
  addrlen = slash ? (size_t)(slash - haystack) : strlen(haystack);
  if (addrlen == 0 || addrlen >= sizeof(nw))
    return (-1);
  memcpy(nw, haystack, addrlen);
  nw[addrlen] = '\0';

  prefix = maxbits;
  if (slash != NULL) {
    prefix = strtol(slash + 1, &end, 10);
    if (end == slash + 1 || *end != '\0' || prefix < 0 || prefix > maxbits)
      return (-1);
  }

  if (!is_v6) {
    struct in_addr ipv4, net;

    if (inet_aton(needle, &ipv4) == 0 || inet_aton(nw, &net) == 0)
      return (-1);
    /* Non-zero covers both "match" (1) and the /0 special case (-1). */
    return is_in_network_v4(&ipv4, &net, (char)prefix) ? (0) : (-1);
  } else {
    struct in6_addr ip6, net6, mask6;

    if (inet_pton(AF_INET6, needle, &ip6) != 1 ||
        inet_pton(AF_INET6, nw, &net6) != 1)
      return (-1);

    /* Turn the prefix length into a netmask, one byte at a time. */
    for (i = 0; i < sizeof(mask6.s6_addr); i++) {
      long bits = prefix - (long)(8 * i);

      if (bits >= 8)
        mask6.s6_addr[i] = 0xff;
      else if (bits > 0)
        mask6.s6_addr[i] = (unsigned char)(0xff << (8 - bits));
      else
        mask6.s6_addr[i] = 0;
    }
    return is_in_network_v6(&net6, &mask6, &ip6) ? (0) : (-1);
  }
}

This code above is most critical. The SPF network range match code is in here. It is very interesting since many spam mails are stopped by SPF match alone and with cloud provider based mailing systems plenty of SPF CIDR ranges exist and we must quickly lookup if an IP belongs to the domain it represents.

Once again we do this at wire speed, in the blink of an eye.


/*
 * Reverse-resolve the client IP (global `envip`) and compare the result
 * with the HELO/EHLO name (global `helostring`).
 *
 * On success the host name is left in the global `revdns` and verdict
 * lines are written to mail_reports.generalinfo.
 *
 * Returns 0 when the reverse name equals the HELO name.  Returns -1 with
 * `reason` set to REVDNS_REJECT when no reverse name exists, or to HELOREJ
 * when the names differ.
 *
 * Fixes over the original: the sockaddr is zeroed before use (the port and
 * padding fields were passed to getnameinfo() uninitialised) and the names
 * are compared without regard to case, since DNS host names are
 * case-insensitive.
 */
int revdns_check(void) {
  struct sockaddr_in sa;
  int ret;

  memset(&sa, 0, sizeof(sa));
  sa.sin_family = AF_INET;
  sa.sin_addr.s_addr = inet_addr(envip);

  /* NI_NAMEREQD: fail instead of returning the numeric address. */
  ret = getnameinfo((struct sockaddr *)&sa, sizeof(sa), revdns,
                    sizeof(revdns), NULL, 0, NI_NAMEREQD);
  if (ret) {
    syslog(LOG_ERR, "Reject:: Could not get reverse hostname");
    reason = REVDNS_REJECT;
    return (-1);
  }
  syslog(LOG_INFO, "Reverse DNS Host of %s is %s", envip, revdns);

  snprintf(mail_reports.generalinfo.revdnsverdict,
           sizeof(mail_reports.generalinfo.revdnsverdict),
           "Reverse DNS works of IP is %s\n", revdns);

  if (!strcasecmp(revdns, helostring)) {
    syslog(LOG_ERR, "HELO string Matches reverse DNS!!!");
    snprintf(mail_reports.generalinfo.heloverdict,
             sizeof(mail_reports.generalinfo.heloverdict),
             "Reverse DNS matches HELO %s\n", helostring);
    return (0);
  }
  reason = HELOREJ;
  syslog(LOG_ERR, "HELO string does not match reverse DNS");

  snprintf(mail_reports.generalinfo.heloverdict,
           sizeof(mail_reports.generalinfo.heloverdict),
           "HELO string does not match reverse DNS\n");
  return (-1);
}

/*
 * Queue an asynchronous MX query for the sender domain `dom`; the answer
 * is handled by dispatch_mxlookup().  Always returns 0.
 */
int mxlookup(char *dom) {
  const int queued = 0;

  syslog(LOG_INFO, "Checking MX of sender... %s", dom);
  lookup_record(T_MX, dom, dispatch_mxlookup);

  return queued;
}

/*
 * Queue the sender-score lookup for the client IP (global `envip`).
 *
 * Builds the name "d.c.b.a.score.senderscore.com" for the address a.b.c.d
 * in the global `scoreQuery` and starts an A query for it; the answer is
 * handled by dispatch_score().
 *
 * Returns 0 once the query is queued, -1 if envip is not a valid dotted
 * IPv4 address.
 *
 * Fixes over the original, which split the text with strsep(): the
 * strndup() copy was leaked (strsep() leaves the pointer NULL, so free()
 * released nothing), more than four components overflowed oct[], and
 * fewer than four left entries uninitialised.  Parsing the address with
 * inet_pton() needs no allocation and validates the input.
 */
int senderscore(void) {
  struct in_addr addr;
  const unsigned char *o;

  syslog(LOG_INFO, "Checking sender score of sending IP ");
  if (inet_pton(AF_INET, envip, &addr) != 1) {
    syslog(LOG_ERR, "Cannot parse sending IP %s for sender score", envip);
    return (-1);
  }

  /* s_addr is in network order, so o[0]..o[3] are the octets a, b, c, d. */
  o = (const unsigned char *)&addr.s_addr;
  /* NOTE(review): 1024 is kept from the original; confirm it matches the
   * declared size of scoreQuery. */
  snprintf(scoreQuery, 1024, "%u.%u.%u.%u.score.senderscore.com", o[3], o[2],
           o[1], o[0]);
  lookup_record(T_A, scoreQuery, dispatch_score);

  return (0);
}

/*
 * Queue the SPF (TXT) lookup for the sender domain `dom`; answers are
 * handled by dispatch_txt().  Before the query is queued the globals
 * ip_both and ip_v4 are set to 0 and 1 respectively.  Always returns 0.
 */
int spfquery(char *dom) {
  ip_both = 0;
  ip_v4 = 1;

  syslog(LOG_INFO, "Checking SPF of sending domain %s", dom);

  lookup_record(T_TXT, dom, dispatch_txt);

  return 0;
}

The above code does reverse DNS, sender score and so on. This mostly finishes our DNS heavy lifting.

The below code is the entry point for everything DNS related you read above. Here is where the game starts. And finishes.


int dnschecks() {
  struct spflist *spf;
  char *dom;
  int spfmatch = -1;
  dom = strchr(envfrom, '@');
  dom += 1;
  event_init();

  senderscore();
  mxlookup(dom);
  spfquery(dom);

  syslog(LOG_INFO, "DNS despatch begin...");
  event_dispatch();
  syslog(LOG_INFO, "DNS despatch END...analysing results");

  /* XXX sometimes you don't have a score, and that is ok */
  if (senderScore < 70 && (senderScore != 0)) {
    syslog(LOG_INFO, 
	"Rejecting Sender score of %d which is < 70 ", senderScore);
	snprintf(mail_reports.generalinfo.scoreverdict,
	sizeof(mail_reports.generalinfo.scoreverdict), 
	"Rejecting Sender score of %d which is < 70 \n", senderScore);
    reason = SENDER_SCORE_TOO_LOW;
    return -1;
  }
  if(!senderScore) {
	snprintf(mail_reports.generalinfo.scoreverdict,
	sizeof(mail_reports.generalinfo.scoreverdict), 
	"No sender score assigned to %s\n", envip);
  	syslog(LOG_INFO, "No sender score assigned to %s", envip);
  }

  syslog(LOG_INFO, "Mail accepted for sender score %d", senderScore);

  if (mxexists == -1) {
    syslog(LOG_INFO, "Sender domain has no MX");
    reason = SENDER_NO_MX;
    return -1;
  }

  syslog(LOG_INFO, "SPF results iterate");
  SLIST_FOREACH(spf, &spflist_head, next) {
    syslog(LOG_INFO, "Checking if SPF is valid %s", spf->iprange);
    spfmatch = check_valid_spf(envip, spf->iprange);
    if (!spfmatch) {
      syslog(LOG_INFO, "SPF matches...%s %s", envip, spf->iprange);
      snprintf(mail_reports.generalinfo.spfverdict,
	sizeof(mail_reports.generalinfo.spfverdict), 
	"SPF matches...Our IP: %s SPF:%s\n", envip, spf->iprange);

      syslog(LOG_INFO, "SPF matches...%s %s", envip, spf->iprange);
      break;
    }
  }
  while (!SLIST_EMPTY(&spflist_head)) {
    spf = SLIST_FIRST(&spflist_head);
    SLIST_REMOVE_HEAD(&spflist_head, next);
    free(spf);
  }

  if (!spfmatch) {
    return 0;
  } else {
    snprintf(mail_reports.generalinfo.spfverdict,
	sizeof(mail_reports.generalinfo.spfverdict), 
	"SPF mismatch...\n");
    syslog(LOG_INFO, "SPF mismatch...%s %s", envip, envfrom);
    return -1;
  }
  return 0;
}

Explanation of the above DNS code

This code can broadly be categorized into the following:

  • MX lookup callback
  • SPF recursive lookup
  • Sender score lookup
  • Other DNS callbacks
  • Each callback is invoked asynchronously
  • Purpose of this model is performance at cost of complexity
  • The above C code is how node.js works in today’s world

Can the sub systems work independently?

SpamCheetah is designed in such a way that each part works with others in a tightly coupled way at the same time maintaining the independence of functionality according to old school UNIX tradition.

Does it support modules?

Not yet. There are no plans either.

Other languages?

Yes, SpamCheetah has a test suite written in pytest.

The entire backend, including the backend daemon, is in perl.

Is it completely written in C?

No. SpamCheetah’s main mail loop is written in C. The rest is in various languages: the test suite is in python, the backend in perl and the frontend in Angular.