Monday, September 14, 2015

Beaglebone PRU DDR memory access

Here's some C and PRU assembly code I wrote to see how fast the PRU can write to system (DDR) memory.


 // Loads a .bin file into a BeagleBone PRU and then interacts with it  
 // in shared PRU memory and (system-wide) DDR memory.  
 //  
 // Pass in the filename of the .bin file on the command line, eg:  
 // $ ./pru_loader foo.bin  
 //  
 // Compile with:  
 // gcc -std=gnu99 -o pru_loader pru_loader.c -lprussdrv  
   
 #include <unistd.h>  
 #include <stdio.h>  
 #include <inttypes.h>  
 #include <prussdrv.h>  
 #include <pruss_intc_mapping.h>  
   
 // Loads the PRU binary named by argv[1], tells the PRU (via the first two
 // words of shared PRU RAM) where the uio_pruss DDR segment lives, then
 // samples progress once a second for ten seconds and finally waits for the
 // PRU's completion event.
 //
 // Returns 0 on success, 1 on a usage error or any prussdrv failure.
 int main(int argc, char **argv) {
  if (argc != 2) {
   fprintf(stderr, "Usage: %s pru_code.bin\n", argv[0]);
   return 1;
  }

  // If this segfaults, make sure you're executing as root.
  if (prussdrv_init() != 0) {
   fprintf(stderr, "prussdrv_init() failed\n");
   return 1;
  }
  if (prussdrv_open(PRU_EVTOUT_0) == -1) {
   fprintf(stderr, "prussdrv_open() failed\n");
   return 1;
  }

  tpruss_intc_initdata pruss_intc_initdata = PRUSS_INTC_INITDATA;
  if (prussdrv_pruintc_init(&pruss_intc_initdata) < 0) {
   fprintf(stderr, "prussdrv_pruintc_init() failed\n");
   prussdrv_exit();
   return 1;
  }

  // Pointer into the shared PRU DRAM.
  volatile void *shared_memory_void = NULL;
  if (prussdrv_map_prumem(PRUSS0_SHARED_DATARAM,
      (void **) &shared_memory_void) < 0 || shared_memory_void == NULL) {
   fprintf(stderr, "prussdrv_map_prumem() failed\n");
   prussdrv_exit();
   return 1;
  }
  // Word-sized view, since we exchange data with the PRU in 4-byte chunks.
  volatile uint32_t *shared_memory = (volatile uint32_t *) shared_memory_void;

  // Pointer into the DDR RAM mapped by the uio_pruss kernel module.
  volatile void *shared_ddr = NULL;
  if (prussdrv_map_extmem((void **) &shared_ddr) < 0 || shared_ddr == NULL) {
   fprintf(stderr, "prussdrv_map_extmem() failed\n");
   prussdrv_exit();
   return 1;
  }
  volatile uint32_t *ddr_words = (volatile uint32_t *) shared_ddr;
  unsigned int shared_ddr_len = prussdrv_extmem_size();
  unsigned int physical_address = prussdrv_get_phys_addr((void *) shared_ddr);

  printf("%u bytes of shared DDR available.\n Physical (PRU-side) address:%x\n",
      shared_ddr_len, physical_address);
  printf("Virtual (linux-side) address: %p\n\n", (void *) shared_ddr);

  // We'll use the first 8 bytes of PRU memory to tell it where the
  // shared segment of system memory is.
  shared_memory[0] = physical_address;
  shared_memory[1] = shared_ddr_len;

  // Change to 0 to use PRU0
  int which_pru = 1;
  if (prussdrv_exec_program(which_pru, argv[1]) < 0) {
   // Without this check a bad path would leave us waiting forever below.
   fprintf(stderr, "prussdrv_exec_program(%s) failed\n", argv[1]);
   prussdrv_exit();
   return 1;
  }

  for (int i = 0; i < 10; i++) {
   sleep(1);
   // See if it's successfully writing the physical address of each word at
   // the (virtual, from our viewpoint) address
   printf("DDR[%d] is: %p / 0x%" PRIx32 "\n", i, (void *) &ddr_words[i],
       ddr_words[i]);

   // The PRU program reuses word 0 of shared RAM as its completed-pass
   // counter once it has read the DDR address from it.
   uint32_t passes = shared_memory[0];
   // 64-bit product: passes * segment size exceeds INT_MAX well before the
   // PRU finishes, so a plain int would wrap and print a negative number.
   uint64_t bytes_written = (uint64_t) passes * shared_ddr_len;
   printf("Bytes written: %" PRIu64 "\n", bytes_written);
  }

  // Wait for the PRU to let us know it's done
  prussdrv_pru_wait_event(PRU_EVTOUT_0);
  printf("All done\n");

  prussdrv_pru_disable(which_pru);
  prussdrv_exit();

  return 0;
 }
   

And here's the assembly:
 // DDR write-throughput test for the PRU.
 //
 // Protocol with the host loader:
 //   shared RAM word 0 (in):  physical address of the shared DDR segment
 //   shared RAM word 1 (in):  size of that segment in bytes
 //   shared RAM word 0 (out): number of completed passes over the segment
 //                            (overwrites the address once it has been read)
 // After the final pass the host is signalled and the PRU halts.
 .origin 0
 .entrypoint TOP

 // Register aliases: base address and byte length of the DDR segment, and
 // the base address of shared PRU RAM.
 #define DDR r29
 #define DDR_SIZE r28
 #define SHARED_RAM r27

 // Address of shared PRU RAM as seen from the PRU.
 #define SHARED_RAM_ADDRESS 0x10000

 TOP:
  // Enable OCP master ports in SYSCFG register
  // (read-modify-write of the word at C4 + 4, clearing bit 4)
  LBCO r0, C4, 4, 4
  CLR r0, r0, 4
  SBCO r0, C4, 4, 4

  MOV SHARED_RAM, SHARED_RAM_ADDRESS

  // From shared RAM, grab the address of the shared DDR segment
  LBBO DDR, SHARED_RAM, 0, 4
  // And the size of the segment from SHARED_RAM + 4
  LBBO DDR_SIZE, SHARED_RAM, 4, 4

  // BIGLOOP is one pass overwriting the shared DDR memory segment
  // r12 = passes completed so far, r14 = total number of passes to run
  mov r12, 0
  mov r14, 10000
 BIGLOOP:

  // Start at the beginning of the segment
  // r10 = current write address, r11 = one past the end of the segment
  MOV r10, DDR
  ADD r11, DDR, DDR_SIZE

  // Tight loop writing the physical address of each word into that word
 LOOP0:
  SBBO r10, r10, 0, 4
  ADD r10, r10, 4
  // Keep looping while r10 < r11. The quick-branch comparison reads
  // "last operand <op> middle operand", so QBLT LOOP0, r11, r10 branches
  // when r10 < r11 (not when r11 < r10, as the operand order suggests).
  QBLT LOOP0, r11, r10

  // One full pass done: bump the counter and publish it in shared RAM
  // word 0, where the host polls it to compute bytes written.
  ADD r12, r12, 1
  SBBO r12, SHARED_RAM, 0, 4
  // Same operand convention as above: branch while r14 > r12, i.e. until
  // r12 reaches the pass limit.
  QBGT BIGLOOP, r12, r14

  // Interrupt the host so it knows we're done
  // (the host side blocks on PRU_EVTOUT_0 until this fires)
  MOV r31.b0, 19 + 16

 // Don't forget to halt! 
 HALT
   

Here's the output I get, about 200MB/sec:

 262144 bytes of shared DDR available.  
  Physical (PRU-side) address:9e6c0000  
 Virtual (linux-side) address: 0xb6d78000  
   
 DDR[0] is: 0xb6d78000 / 0x9e6c0000  
 Bytes written: 200540160  
 DDR[1] is: 0xb6d78004 / 0x9e6c0004  
 Bytes written: 401342464  
 DDR[2] is: 0xb6d78008 / 0x9e6c0008  
 Bytes written: 601882624  
 DDR[3] is: 0xb6d7800c / 0x9e6c000c  
 Bytes written: 802160640  
 DDR[4] is: 0xb6d78010 / 0x9e6c0010  
 Bytes written: 1002176512  
 DDR[5] is: 0xb6d78014 / 0x9e6c0014  
 Bytes written: 1202454528  
 DDR[6] is: 0xb6d78018 / 0x9e6c0018  
 Bytes written: 1402470400  
 DDR[7] is: 0xb6d7801c / 0x9e6c001c  
 Bytes written: 1602748416  
 DDR[8] is: 0xb6d78020 / 0x9e6c0020  
 Bytes written: 1802764288  
 DDR[9] is: 0xb6d78024 / 0x9e6c0024  
 Bytes written: 2003042304  
 All done  
   


If I crank up the number of bytes written by SBBO from 4 to 8 (in the SBBO and ADD after LOOP0), then I think it ends up writing the contents of r10 and r11 into memory, and I get 320MB/sec.  If I crank it up to 16 bytes per write, I get 450MB/sec.

So the PRU really can write very quickly to system RAM.

No comments: